From 4c3b98f1bfb3de4df647e48d6f91bef37dcb9828 Mon Sep 17 00:00:00 2001 From: achiefa Date: Thu, 30 Oct 2025 15:16:42 +0000 Subject: [PATCH 01/16] Making the photon a downloadable resource --- validphys2/src/validphys/loader.py | 38 ++++++++++++++++++++++ validphys2/src/validphys/photon/compute.py | 4 +++ 2 files changed, 42 insertions(+) diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index 2372f7fd15..3c75d55ba8 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -216,6 +216,14 @@ def available_ekos(self): return { eko_path.parent.name.split("_")[1] for eko_path in self._theories_path.glob("*/eko.tar") } + + @property + @functools.lru_cache + def available_photons_qed(self): + """Return a string token for each of the available theories""" + return { + eko_path.parent.name.split("_")[1] for eko_path in self._theories_path.glob("*/eko.npz") + } @functools.cached_property def available_photons(self): @@ -320,6 +328,14 @@ def check_eko(self, theoryID): if not eko_path.exists(): raise EkoNotFound(f"Could not find eko {eko_path} in theory: {theoryID}") return eko_path + + @functools.lru_cache + def check_photonQED(self, theoryID, luxset): + """Check the Photon QED set exists and return the path to it""" + photon_qed_path = self._photons_qed_path / f"photon_qed_{theoryID.id}_{luxset}.tar" + if not photon_qed_path.exists(): + raise PhotonQEDNotFound(f"Could not find Photon QED set {photon_qed_path} in theory: {theoryID}") + return photon_qed_path @functools.lru_cache def check_photonQED(self, theoryID, luxset): @@ -851,6 +867,16 @@ def eko_index(self): @_key_or_loader_error def eko_urls(self): return self.nnprofile['eko_urls'] + + @property + @_key_or_loader_error + def photon_qed_index(self): + return self.nnprofile['photon_qed_index'] + + @property + @_key_or_loader_error + def photon_qed_urls(self): + return self.nnprofile['photon_qed_urls'] @property @_key_or_loader_error @@ -884,6 +910,7 @@ 
def lhapdf_urls(self): def _remote_files_from_url(self, url, index, thing='files'): index_url = url + index + import ipdb; ipdb.set_trace() try: resp = requests.get(index_url) resp.raise_for_status() @@ -931,6 +958,13 @@ def remote_ekos(self): token = 'eko_' rt = self.remote_files(self.eko_urls, self.eko_index, thing="ekos") return {k[len(token) :]: v for k, v in rt.items()} + + @property + @functools.lru_cache + def remote_photons_qed(self): + token = 'photon_qed_' + rt = self.remote_files(self.photon_qed_urls, self.photon_qed_index, thing="photons_qed") + return {k[len(token) :]: v for k, v in rt.items()} @property @functools.lru_cache @@ -972,6 +1006,10 @@ def downloadable_theories(self): @property def downloadable_ekos(self): return list(self.remote_ekos) + + @property + def downloadable_photonsQED(self): + return list(self.remote_photons_qed) @property def downloadable_photons(self): diff --git a/validphys2/src/validphys/photon/compute.py b/validphys2/src/validphys/photon/compute.py index d3effa8ec1..119245c718 100644 --- a/validphys2/src/validphys/photon/compute.py +++ b/validphys2/src/validphys/photon/compute.py @@ -3,6 +3,7 @@ from functools import lru_cache import logging import tempfile +from concurrent.futures import ThreadPoolExecutor from joblib import Parallel, delayed import numpy as np @@ -16,11 +17,13 @@ from validphys.core import FKTableSpec from validphys.loader import Loader, PhotonQEDNotFound from validphys.n3fit_data import replica_luxseed +from validphys.loader import Loader, PhotonQEDNotFound from . 
import structure_functions as sf from .alpha import Alpha log = logging.getLogger(__name__) +loader = Loader() # not the complete fiatlux runcard since some parameters are set in the code FIATLUX_DEFAULT = { @@ -157,6 +160,7 @@ def _compute_photon_set(self): f2 = f2_func(luxset.members[photonreplica]) fl = fl_func(luxset.members[photonreplica]) + alpha = Alpha(theory, self.fiatlux_runcard["q2_max"]) with tempfile.NamedTemporaryFile(mode="w") as tmp: yaml.dump(fiatlux_runcard, tmp) lux = fiatlux.FiatLux(tmp.name) From b9d56ed8f18488926ed406e60452f5e9a69e00a4 Mon Sep 17 00:00:00 2001 From: achiefa Date: Wed, 5 Nov 2025 16:28:17 +0000 Subject: [PATCH 02/16] Save parameters --- n3fit/src/n3fit/backends/keras_backend/callbacks.py | 11 +++++++++-- n3fit/src/n3fit/model_trainer.py | 4 +++- n3fit/src/n3fit/performfit.py | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 8dc6bbee48..eec950cdd7 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -14,6 +14,7 @@ import logging from time import time +import pathlib from keras import backend as K from keras.callbacks import Callback, TensorBoard @@ -23,7 +24,6 @@ log = logging.getLogger(__name__) - class CallbackStep(Callback): """ Wrapper around the keras Callback that keeps track of how the steps are divided @@ -117,10 +117,13 @@ class StoppingCallback(CallbackStep): will be set to true """ - def __init__(self, stopping_object, log_freq=100): + def __init__(self, stopping_object, log_freq=100, savedir=None): super().__init__() self.log_freq = log_freq self.stopping_object = stopping_object + if savedir is not None: + self.savedir = savedir / "weights" + self.savedir.mkdir(parents=True, exist_ok=True) def on_step_end(self, epoch, logs=None): """Function to be called at the end of every epoch @@ -137,6 +140,10 @@ def 
on_step_end(self, epoch, logs=None): # but it needs to be run every epoch, which makes no sense if K.backend() == "jax": _ = self.model.compute_losses() + + # Save parameters for NTK + if self.savedir is not None and ((epoch + 1) % self.log_freq) == 0: + self.model.save_weights(self.savedir / f"params_epoch_{epoch}.h5") self.stopping_object.monitor_chi2(logs, epoch, print_stats=print_stats) if self.stopping_object.stop_here(): diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 9917e22c89..683e04f817 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -112,6 +112,7 @@ def __init__( theoryid=None, lux_params=None, replicas=None, + replicadir=None ): """ Parameters @@ -168,6 +169,7 @@ def __init__( self.lux_params = lux_params self.replicas = replicas self.experiments_data = experiments_data + self.replicadir = replicadir # Initialise internal variables which define behaviour if debug: @@ -710,7 +712,7 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: In the same way, every ``PUSH_INTEGRABILITY_EACH`` epochs the integrability will be multiplied by their respective integrability multipliers """ - callback_st = callbacks.StoppingCallback(stopping_object) + callback_st = callbacks.StoppingCallback(stopping_object, savedir=self.replicadir) callback_pos = callbacks.LagrangeCallback( self.training["posdatasets"], self.training["posmultipliers"], diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 558e9690b6..6cf085d5fb 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -197,6 +197,7 @@ def performfit( theoryid=theoryid, lux_params=fiatlux, replicas=replica_idxs, + replicadir = replica_path / f"replica_{replica_idxs[0]}" ) # This is just to give a descriptive name to the fit function From d115ea0aa16d2b1a411f0f4c8f32908865537695 Mon Sep 17 00:00:00 2001 From: achiefa Date: Mon, 23 Feb 2026 21:11:14 +0100 Subject: 
[PATCH 03/16] Allow checkpointing for multiple replicas --- .../n3fit/backends/keras_backend/callbacks.py | 50 +++++++++++++++---- n3fit/src/n3fit/model_trainer.py | 34 ++++++++++--- n3fit/src/n3fit/performfit.py | 8 ++- 3 files changed, 76 insertions(+), 16 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index eec950cdd7..605c249577 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -13,8 +13,8 @@ """ import logging +from pathlib import Path from time import time -import pathlib from keras import backend as K from keras.callbacks import Callback, TensorBoard @@ -24,6 +24,7 @@ log = logging.getLogger(__name__) + class CallbackStep(Callback): """ Wrapper around the keras Callback that keeps track of how the steps are divided @@ -117,13 +118,10 @@ class StoppingCallback(CallbackStep): will be set to true """ - def __init__(self, stopping_object, log_freq=100, savedir=None): + def __init__(self, stopping_object, log_freq=100): super().__init__() self.log_freq = log_freq self.stopping_object = stopping_object - if savedir is not None: - self.savedir = savedir / "weights" - self.savedir.mkdir(parents=True, exist_ok=True) def on_step_end(self, epoch, logs=None): """Function to be called at the end of every epoch @@ -140,10 +138,6 @@ def on_step_end(self, epoch, logs=None): # but it needs to be run every epoch, which makes no sense if K.backend() == "jax": _ = self.model.compute_losses() - - # Save parameters for NTK - if self.savedir is not None and ((epoch + 1) % self.log_freq) == 0: - self.model.save_weights(self.savedir / f"params_epoch_{epoch}.h5") self.stopping_object.monitor_chi2(logs, epoch, print_stats=print_stats) if self.stopping_object.stop_here(): @@ -203,6 +197,44 @@ def on_step_end(self, epoch, logs=None): self._update_weights() +class StoreCallback(CallbackStep): + """ + Given a ``savedir``, the 
callback will store the model parameters in + that directory every ``check_freq`` epochs. + + Parameters + ---------- + pdf_model: MetaModel + The multi-replica PDF model + replica_paths: list[Path] + One path per replica. Weights are saved under /weights/. + check_freq: int + Save every this many epochs (default: 100) + """ + + def __init__(self, pdf_model, replica_paths, check_freq=100): + super().__init__() + self.check_freq = check_freq + self.pdf_model = pdf_model + self.weight_dirs = [] + for path in replica_paths: + weight_dir = path / "weights" + weight_dir.mkdir(parents=True, exist_ok=True) + self.weight_dirs.append(weight_dir) + + def on_step_end(self, epoch, logs=None): + """Function to be called at the end of every epoch + Every ``check_freq`` number of epochs, the parameters of the model will + be stored in the indicated directory. + """ + if ((epoch + 1) % self.check_freq) == 0: + pdf_replicas = self.pdf_model.split_replicas() + for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs): + filepath = weight_dir / f"params_epoch_{epoch}.h5" + replica_model.save_weights(filepath) + log.info(f"Saved parameters at epoch {epoch} in {filepath}") + + def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0): """ Generate tensorboard logging details at ``log_dir``. diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 683e04f817..6b1f8f86ba 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -112,7 +112,9 @@ def __init__( theoryid=None, lux_params=None, replicas=None, - replicadir=None + save_checkpoints=False, + replica_path=None, + checkpoint_freq=100, ): """ Parameters @@ -153,6 +155,13 @@ def __init__( if not give, the photon is not generated replicas: list list with the replicas ids to be fitted + save_checkpoints: bool + whether to save checkpoints (i.e. model parameters) during the fit. This requires + `replica_path` to be set as well.
Not doing this will raise an error. + replica_path: Path + root path for all replicas. + checkpoint_freq: int + frequency (in epochs) at which to save checkpoints. Only relevant if `save_checkpoints` is True. """ # Save all input information self.exp_info = list(exp_info) @@ -169,7 +178,13 @@ def __init__( self.lux_params = lux_params self.replicas = replicas self.experiments_data = experiments_data - self.replicadir = replicadir + + # Checkpointing options + self.save_checkpoints = save_checkpoints + self.replica_path = replica_path + self.checkpoint_freq = checkpoint_freq + if self.save_checkpoints and self.replica_path is None: + raise ValueError("To save checkpoints, the 'replica_path' key must be set as well.") # Initialise internal variables which define behaviour if debug: @@ -712,7 +727,7 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: In the same way, every ``PUSH_INTEGRABILITY_EACH`` epochs the integrability will be multiplied by their respective integrability multipliers """ - callback_st = callbacks.StoppingCallback(stopping_object, savedir=self.replicadir) + callback_st = callbacks.StoppingCallback(stopping_object) callback_pos = callbacks.LagrangeCallback( self.training["posdatasets"], self.training["posmultipliers"], @@ -723,11 +738,18 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: self.training["integmultipliers"], update_freq=PUSH_INTEGRABILITY_EACH, ) + callback_list = [callback_st, callback_pos, callback_integ] + + if self.save_checkpoints: + pdf_model = training_model.get_layer("PDFs") + replica_paths = [self.replica_path / f"replica_{r}" for r in self.replicas] + checpoint_callback = callbacks.StoreCallback( + pdf_model=pdf_model, replica_paths=replica_paths, check_freq=self.checkpoint_freq + ) + callback_list.append(checpoint_callback) training_model.perform_fit( - epochs=epochs, - verbose=False, - callbacks=self.callbacks + [callback_st, callback_pos, callback_integ], + 
epochs=epochs, verbose=False, callbacks=self.callbacks + callback_list ) def _hyperopt_override(self, params): diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 6cf085d5fb..dc5ad83cb0 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -42,6 +42,8 @@ def performfit( maxcores=None, double_precision=False, parallel_models=True, + save_checkpoints=False, + checkpoint_freq=100, ): """ This action will (upon having read a validcard) process a full PDF fit @@ -128,6 +130,8 @@ def performfit( whether to use double precision parallel_models: bool whether to run models in parallel + save_checkpoints: bool + whether to save checkpoints (i.e. model parameters) during the fit. """ from n3fit.backends import set_initial_state @@ -197,7 +201,9 @@ def performfit( theoryid=theoryid, lux_params=fiatlux, replicas=replica_idxs, - replicadir = replica_path / f"replica_{replica_idxs[0]}" + save_checkpoints=save_checkpoints, + replica_path=replica_path, + checkpoint_freq=checkpoint_freq, ) # This is just to give a descriptive name to the fit function From f6b71943f21622865fbc7d3238a468c30d1f2932 Mon Sep 17 00:00:00 2001 From: achiefa Date: Mon, 23 Feb 2026 23:28:36 +0100 Subject: [PATCH 04/16] Serialization for colibri --- .../n3fit/backends/keras_backend/callbacks.py | 11 +++++++--- n3fit/src/n3fit/model_trainer.py | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 605c249577..07760c2160 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -218,7 +218,7 @@ def __init__(self, pdf_model, replica_paths, check_freq=100): self.pdf_model = pdf_model self.weight_dirs = [] for path in replica_paths: - weight_dir = path / "weights" + weight_dir = path / "parameters" weight_dir.mkdir(parents=True, exist_ok=True) 
self.weight_dirs.append(weight_dir) @@ -230,8 +230,13 @@ def on_step_end(self, epoch, logs=None): if ((epoch + 1) % self.check_freq) == 0: pdf_replicas = self.pdf_model.split_replicas() for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs): - filepath = weight_dir / f"params_epoch_{epoch}.h5" - replica_model.save_weights(filepath) + filepath = weight_dir / f"params_{epoch}.npz" + # save parameters as expected by colibri + trainable_weights_flat = np.concatenate( + [w.numpy().flatten() for w in replica_model.trainable_weights] + ) + np.savez(filepath, params=trainable_weights_flat) + # replica_model.save_weights(filepath) log.info(f"Saved parameters at epoch {epoch} in {filepath}") diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 6b1f8f86ba..1c7eb98281 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -945,6 +945,27 @@ def hyperparametrizable(self, params): ) replicas_settings.append(tmp) + # TODO: temporary fix to use NTK utilities in colibri + # Create model pkl for colibri n3fit module + _init_args = { + "flav_info": self.flavinfo, + "replica_range_settings": { + "min_replica": np.sort(self.replicas)[0], + "max_replica": np.sort(self.replicas)[-1], + }, + "impose_sumrule": self.impose_sumrule, + "fitbasis": self.fitbasis, + "nodes": params["nodes_per_layer"], + "activations": params["activation_per_layer"], + "initializer_name": params["initializer"], + "layer_type": params["layer_type"], + } + state = {"_init_args": _init_args} + import pickle + + with open(self.replica_path / "pdf_model.pkl", "wb") as file: + pickle.dump(state, file) + ### Training loop for k, partition in enumerate(self.kpartitions): From ebd3e3afd447275c9269e72c3cc135f0bd274e3e Mon Sep 17 00:00:00 2001 From: achiefa Date: Mon, 23 Feb 2026 23:44:41 +0000 Subject: [PATCH 05/16] change folder structure --- n3fit/src/n3fit/model_trainer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff
--git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 1c7eb98281..7ff1f65e28 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -12,6 +12,7 @@ from collections import namedtuple from itertools import zip_longest import logging +import pickle import numpy as np @@ -742,7 +743,10 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: if self.save_checkpoints: pdf_model = training_model.get_layer("PDFs") - replica_paths = [self.replica_path / f"replica_{r}" for r in self.replicas] + # Save parameters where colibri will look for checkpoints + replica_paths = [ + self.replica_path.parent / f"fit_replicas/replica_{r}" for r in self.replicas + ] checpoint_callback = callbacks.StoreCallback( pdf_model=pdf_model, replica_paths=replica_paths, check_freq=self.checkpoint_freq ) @@ -961,9 +965,8 @@ def hyperparametrizable(self, params): "layer_type": params["layer_type"], } state = {"_init_args": _init_args} - import pickle - with open(self.replica_path / "pdf_model.pkl", "wb") as file: + with open(self.replica_path.parent / "pdf_model.pkl", "wb") as file: pickle.dump(state, file) ### Training loop From df2dc918bb2f38b2819ae01fa2c6b0c66237eae8 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 00:27:47 +0000 Subject: [PATCH 06/16] Workaround for colibri replicas --- n3fit/src/n3fit/model_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 7ff1f65e28..5a4bafc552 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -955,7 +955,7 @@ def hyperparametrizable(self, params): "flav_info": self.flavinfo, "replica_range_settings": { "min_replica": np.sort(self.replicas)[0], - "max_replica": np.sort(self.replicas)[-1], + "max_replica": np.sort(self.replicas)[0], }, "impose_sumrule": self.impose_sumrule, "fitbasis": self.fitbasis, From 
1f685802a7388c94157576857257f8cfb131f147 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 00:34:07 +0000 Subject: [PATCH 07/16] remove debugging --- validphys2/src/validphys/loader.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index 3c75d55ba8..46b4a6a1b4 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -216,7 +216,7 @@ def available_ekos(self): return { eko_path.parent.name.split("_")[1] for eko_path in self._theories_path.glob("*/eko.tar") } - + @property @functools.lru_cache def available_photons_qed(self): @@ -328,13 +328,15 @@ def check_eko(self, theoryID): if not eko_path.exists(): raise EkoNotFound(f"Could not find eko {eko_path} in theory: {theoryID}") return eko_path - + @functools.lru_cache def check_photonQED(self, theoryID, luxset): """Check the Photon QED set exists and return the path to it""" photon_qed_path = self._photons_qed_path / f"photon_qed_{theoryID.id}_{luxset}.tar" if not photon_qed_path.exists(): - raise PhotonQEDNotFound(f"Could not find Photon QED set {photon_qed_path} in theory: {theoryID}") + raise PhotonQEDNotFound( + f"Could not find Photon QED set {photon_qed_path} in theory: {theoryID}" + ) return photon_qed_path @functools.lru_cache @@ -867,12 +869,12 @@ def eko_index(self): @_key_or_loader_error def eko_urls(self): return self.nnprofile['eko_urls'] - + @property @_key_or_loader_error def photon_qed_index(self): return self.nnprofile['photon_qed_index'] - + @property @_key_or_loader_error def photon_qed_urls(self): @@ -910,7 +912,6 @@ def lhapdf_urls(self): def _remote_files_from_url(self, url, index, thing='files'): index_url = url + index - import ipdb; ipdb.set_trace() try: resp = requests.get(index_url) resp.raise_for_status() @@ -958,7 +959,7 @@ def remote_ekos(self): token = 'eko_' rt = self.remote_files(self.eko_urls, self.eko_index, thing="ekos") return 
{k[len(token) :]: v for k, v in rt.items()} - + @property @functools.lru_cache def remote_photons_qed(self): @@ -1006,7 +1007,7 @@ def downloadable_theories(self): @property def downloadable_ekos(self): return list(self.remote_ekos) - + @property def downloadable_photonsQED(self): return list(self.remote_photons_qed) From a32daabc1228cdb80a453558a1ddc4a6d174d897 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 01:21:18 +0000 Subject: [PATCH 08/16] Remove stopping and add would stop --- n3fit/src/n3fit/io/writer.py | 7 +++++++ n3fit/src/n3fit/model_trainer.py | 5 +++++ n3fit/src/n3fit/performfit.py | 2 ++ n3fit/src/n3fit/stopping.py | 16 +++++++++++++++- 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index c6d02e9569..ccc0880fdc 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -279,11 +279,18 @@ def write_data(self, save_path, fitname, weights_name): replica_path.mkdir(exist_ok=True, parents=True) self._write_chi2s(replica_path / "chi2exps.log") + self._write_would_stop_epoch(replica_path / "would_stop_epoch.txt") self._write_metadata_json(i, replica_path / f"{fitname}.json") self._export_pdf_grid(i, replica_path / f"{fitname}.exportgrid") if weights_name: self._write_weights(i, replica_path / f"{weights_name}") + def _write_would_stop_epoch(self, out_path): + epoch = self.stopping_object.would_stop_epoch + with open(out_path, "w", encoding="utf-8") as f: + f.write(str(epoch) if epoch is not None else "None") + f.write("\n") + def _write_chi2s(self, out_path): # Note: same for all replicas, unless run separately chi2_log = self.stopping_object.chi2exps_json() diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 5a4bafc552..c78291cce6 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -116,6 +116,7 @@ def __init__( save_checkpoints=False, replica_path=None, checkpoint_freq=100, + 
dont_stop=False, ): """ Parameters @@ -163,6 +164,8 @@ def __init__( root path for all replicas. checkpoint_freq: int frequency (in epochs) at which to save checkpoints. Only relevant if `save_checkpoints` is True. + dont_stop: bool + whether to disable the stopping mechanism, i.e. to run for all epochs regardless of the validation chi2 """ # Save all input information self.exp_info = list(exp_info) @@ -179,6 +182,7 @@ def __init__( self.lux_params = lux_params self.replicas = replicas self.experiments_data = experiments_data + self.dont_stop = dont_stop # Checkpointing options self.save_checkpoints = save_checkpoints @@ -1035,6 +1039,7 @@ def hyperparametrizable(self, params): stopping_patience=stopping_epochs, threshold_positivity=threshold_pos, threshold_chi2=threshold_chi2, + dont_stop=self.dont_stop, ) # Compile each of the models with the right parameters diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index dc5ad83cb0..af9edcc2d6 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -44,6 +44,7 @@ def performfit( parallel_models=True, save_checkpoints=False, checkpoint_freq=100, + dont_stop=False, ): """ This action will (upon having read a validcard) process a full PDF fit @@ -204,6 +205,7 @@ def performfit( save_checkpoints=save_checkpoints, replica_path=replica_path, checkpoint_freq=checkpoint_freq, + dont_stop=dont_stop, ) # This is just to give a descriptive name to the fit function diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 99be8f45e7..577f353db6 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -345,6 +345,7 @@ def __init__( self._dont_stop = dont_stop self._stop_now = False + self._would_stop_epoch = None self.stopping_patience = stopping_patience self.total_epochs = total_epochs @@ -481,7 +482,20 @@ def make_stop(self): and reload the history to the point of the best model if any """ self._stop_now = True - 
self._restore_best_weights() + if self._would_stop_epoch is None: + # final_epoch is the last registered epoch (0-indexed); +1 to match stop_epoch convention + self._would_stop_epoch = ( + -1 if self._history.final_epoch is None else self._history.final_epoch + 1 + ) + if not self._dont_stop: + self._restore_best_weights() + + @property + def would_stop_epoch(self): + """Epoch at which early stopping would have triggered. + Returns None if stopping never triggered (fit converged within total_epochs). + When dont_stop=False this equals stop_epoch.""" + return self._would_stop_epoch def _restore_best_weights(self): for i_replica, weights in enumerate(self._best_weights): From 311fca72ced8dbcb2d1dfdb7b3a2e91491ca8415 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 09:46:31 +0000 Subject: [PATCH 09/16] Restoring master version for loader and photon --- validphys2/src/validphys/loader.py | 39 ---------------------- validphys2/src/validphys/photon/compute.py | 4 --- 2 files changed, 43 deletions(-) diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index 46b4a6a1b4..2372f7fd15 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -217,14 +217,6 @@ def available_ekos(self): eko_path.parent.name.split("_")[1] for eko_path in self._theories_path.glob("*/eko.tar") } - @property - @functools.lru_cache - def available_photons_qed(self): - """Return a string token for each of the available theories""" - return { - eko_path.parent.name.split("_")[1] for eko_path in self._theories_path.glob("*/eko.npz") - } - @functools.cached_property def available_photons(self): """Return a string token for each of the available theories""" @@ -329,16 +321,6 @@ def check_eko(self, theoryID): raise EkoNotFound(f"Could not find eko {eko_path} in theory: {theoryID}") return eko_path - @functools.lru_cache - def check_photonQED(self, theoryID, luxset): - """Check the Photon QED set exists and return the path to 
it""" - photon_qed_path = self._photons_qed_path / f"photon_qed_{theoryID.id}_{luxset}.tar" - if not photon_qed_path.exists(): - raise PhotonQEDNotFound( - f"Could not find Photon QED set {photon_qed_path} in theory: {theoryID}" - ) - return photon_qed_path - @functools.lru_cache def check_photonQED(self, theoryID, luxset): """Check the Photon QED set exists and return the path to it""" @@ -880,16 +862,6 @@ def photon_qed_index(self): def photon_qed_urls(self): return self.nnprofile['photon_qed_urls'] - @property - @_key_or_loader_error - def photon_qed_index(self): - return self.nnprofile['photon_qed_index'] - - @property - @_key_or_loader_error - def photon_qed_urls(self): - return self.nnprofile['photon_qed_urls'] - @property @_key_or_loader_error def nnpdf_pdfs_urls(self): @@ -960,13 +932,6 @@ def remote_ekos(self): rt = self.remote_files(self.eko_urls, self.eko_index, thing="ekos") return {k[len(token) :]: v for k, v in rt.items()} - @property - @functools.lru_cache - def remote_photons_qed(self): - token = 'photon_qed_' - rt = self.remote_files(self.photon_qed_urls, self.photon_qed_index, thing="photons_qed") - return {k[len(token) :]: v for k, v in rt.items()} - @property @functools.lru_cache def remote_photons(self): @@ -1008,10 +973,6 @@ def downloadable_theories(self): def downloadable_ekos(self): return list(self.remote_ekos) - @property - def downloadable_photonsQED(self): - return list(self.remote_photons_qed) - @property def downloadable_photons(self): return list(self.remote_photons) diff --git a/validphys2/src/validphys/photon/compute.py b/validphys2/src/validphys/photon/compute.py index 119245c718..d3effa8ec1 100644 --- a/validphys2/src/validphys/photon/compute.py +++ b/validphys2/src/validphys/photon/compute.py @@ -3,7 +3,6 @@ from functools import lru_cache import logging import tempfile -from concurrent.futures import ThreadPoolExecutor from joblib import Parallel, delayed import numpy as np @@ -17,13 +16,11 @@ from validphys.core import 
FKTableSpec from validphys.loader import Loader, PhotonQEDNotFound from validphys.n3fit_data import replica_luxseed -from validphys.loader import Loader, PhotonQEDNotFound from . import structure_functions as sf from .alpha import Alpha log = logging.getLogger(__name__) -loader = Loader() # not the complete fiatlux runcard since some parameters are set in the code FIATLUX_DEFAULT = { @@ -160,7 +157,6 @@ def _compute_photon_set(self): f2 = f2_func(luxset.members[photonreplica]) fl = fl_func(luxset.members[photonreplica]) - alpha = Alpha(theory, self.fiatlux_runcard["q2_max"]) with tempfile.NamedTemporaryFile(mode="w") as tmp: yaml.dump(fiatlux_runcard, tmp) lux = fiatlux.FiatLux(tmp.name) From a1f9a1207f4e7b6072da15181cce30263495493c Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 10:18:36 +0000 Subject: [PATCH 10/16] Apply epoch convention to stored parameters --- n3fit/src/n3fit/backends/keras_backend/callbacks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 07760c2160..ddfa63d3ee 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -230,14 +230,14 @@ def on_step_end(self, epoch, logs=None): if ((epoch + 1) % self.check_freq) == 0: pdf_replicas = self.pdf_model.split_replicas() for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs): - filepath = weight_dir / f"params_{epoch}.npz" + filepath = weight_dir / f"params_{epoch+1}.npz" # save parameters as expected by colibri trainable_weights_flat = np.concatenate( [w.numpy().flatten() for w in replica_model.trainable_weights] ) np.savez(filepath, params=trainable_weights_flat) # replica_model.save_weights(filepath) - log.info(f"Saved parameters at epoch {epoch} in {filepath}") + log.info(f"Saved parameters at epoch {epoch+1} in {filepath}") def gen_tensorboard_callback(log_dir, 
profiling=False, histogram_freq=0): From 6c8858f4be0784ce88831a51880d352035bde1b0 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 10:19:39 +0000 Subject: [PATCH 11/16] Add would_stop_epoch to json --- n3fit/src/n3fit/io/writer.py | 16 +++++++++------- n3fit/src/n3fit/stopping.py | 3 +-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index ccc0880fdc..45cc053a49 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -279,18 +279,11 @@ def write_data(self, save_path, fitname, weights_name): replica_path.mkdir(exist_ok=True, parents=True) self._write_chi2s(replica_path / "chi2exps.log") - self._write_would_stop_epoch(replica_path / "would_stop_epoch.txt") self._write_metadata_json(i, replica_path / f"{fitname}.json") self._export_pdf_grid(i, replica_path / f"{fitname}.exportgrid") if weights_name: self._write_weights(i, replica_path / f"{weights_name}") - def _write_would_stop_epoch(self, out_path): - epoch = self.stopping_object.would_stop_epoch - with open(out_path, "w", encoding="utf-8") as f: - f.write(str(epoch) if epoch is not None else "None") - f.write("\n") - def _write_chi2s(self, out_path): # Note: same for all replicas, unless run separately chi2_log = self.stopping_object.chi2exps_json() @@ -310,6 +303,11 @@ def _write_metadata_json(self, i, out_path): # Note: the 2 arguments below are the same for all replicas, unless run separately timing=self.timings, stop_epoch=self.stopping_object.stop_epoch, + would_stop_epoch=( + self.stopping_object.would_stop_epoch + if self.stopping_object._dont_stop + else self.stopping_object.stop_epoch + ), ) with open(out_path, "w", encoding="utf-8") as fs: @@ -354,6 +352,7 @@ def jsonfit( true_chi2, stop_epoch, timing, + would_stop_epoch, ): """Generates a dictionary containing all relevant metadata for the fit @@ -379,6 +378,8 @@ def jsonfit( epoch at which the stopping stopped (not the one for the best fit!) 
timing: dict dictionary of the timing of the different events that happened + would_stop_epoch: int + epoch at which the stopping would have stopped if it were not set to "dont_stop" """ all_info = {} # Generate preprocessing information @@ -393,6 +394,7 @@ def jsonfit( all_info["arc_lengths"] = arc_lengths all_info["integrability"] = integrability_numbers all_info["timing"] = timing + all_info["would_stop_epoch"] = would_stop_epoch # Versioning info all_info["version"] = version() return all_info diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 577f353db6..672ae0f59f 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -487,8 +487,7 @@ def make_stop(self): self._would_stop_epoch = ( -1 if self._history.final_epoch is None else self._history.final_epoch + 1 ) - if not self._dont_stop: - self._restore_best_weights() + self._restore_best_weights() @property def would_stop_epoch(self): From e7e94504df4c5ed052c3f66fda0b2c3a970d2556 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 24 Feb 2026 14:58:56 +0000 Subject: [PATCH 12/16] Restoring condition --- n3fit/src/n3fit/stopping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 672ae0f59f..577f353db6 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -487,7 +487,8 @@ def make_stop(self): self._would_stop_epoch = ( -1 if self._history.final_epoch is None else self._history.final_epoch + 1 ) - self._restore_best_weights() + if not self._dont_stop: + self._restore_best_weights() @property def would_stop_epoch(self): From c92c918d7c526c07a13a6b9e19147bd742bd7db7 Mon Sep 17 00:00:00 2001 From: achiefa Date: Tue, 17 Mar 2026 17:13:22 +0000 Subject: [PATCH 13/16] Save best parameters --- .../n3fit/backends/keras_backend/callbacks.py | 28 +++++++++++++------ n3fit/src/n3fit/model_trainer.py | 5 +++- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git 
a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index ddfa63d3ee..e5cc2a09c9 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -212,16 +212,25 @@ class StoreCallback(CallbackStep): Save every this many epochs (default: 100) """ - def __init__(self, pdf_model, replica_paths, check_freq=100): + def __init__(self, pdf_model, replica_paths, stopping_object, check_freq=100): super().__init__() self.check_freq = check_freq self.pdf_model = pdf_model self.weight_dirs = [] + self.stopping_object = stopping_object for path in replica_paths: weight_dir = path / "parameters" weight_dir.mkdir(parents=True, exist_ok=True) self.weight_dirs.append(weight_dir) + def _save_weights(self, epoch, tr_weights, weight_dir): + + filepath = weight_dir / f"params_{epoch+1}.npz" + # save parameters as expected by colibri + trainable_weights_flat = np.concatenate([np.asarray(w).flatten() for w in tr_weights]) + np.savez(filepath, params=trainable_weights_flat) + log.info(f"Saved parameters at epoch {epoch+1} in {filepath}") + def on_step_end(self, epoch, logs=None): """Function to be called at the end of every epoch Every ``check_freq`` number of epochs, the parameters of the model will @@ -230,14 +239,15 @@ def on_step_end(self, epoch, logs=None): if ((epoch + 1) % self.check_freq) == 0: pdf_replicas = self.pdf_model.split_replicas() for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs): - filepath = weight_dir / f"params_{epoch+1}.npz" - # save parameters as expected by colibri - trainable_weights_flat = np.concatenate( - [w.numpy().flatten() for w in replica_model.trainable_weights] - ) - np.savez(filepath, params=trainable_weights_flat) - # replica_model.save_weights(filepath) - log.info(f"Saved parameters at epoch {epoch+1} in {filepath}") + weights = replica_model.trainable_weights + self._save_weights(epoch, weights, weight_dir) + + def 
on_train_end(self, logs=None): + """Store the best parameters""" + for idx, weight_dir in enumerate(self.weight_dirs): + best_epoch = self.stopping_object._best_epochs[idx] + best_weights = self.stopping_object._best_weights[idx]['all_NNs'] + self._save_weights(best_epoch, best_weights, weight_dir) def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0): diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index c78291cce6..0b45eb2c61 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -752,7 +752,10 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: self.replica_path.parent / f"fit_replicas/replica_{r}" for r in self.replicas ] checpoint_callback = callbacks.StoreCallback( - pdf_model=pdf_model, replica_paths=replica_paths, check_freq=self.checkpoint_freq + pdf_model=pdf_model, + replica_paths=replica_paths, + check_freq=self.checkpoint_freq, + stopping_object=stopping_object, ) callback_list.append(checpoint_callback) From f6777fc0c403d2bdead86f0e6704c81340711505 Mon Sep 17 00:00:00 2001 From: achiefa Date: Sat, 21 Mar 2026 21:03:08 +0000 Subject: [PATCH 14/16] Add median + correct epoch numbering --- n3fit/src/n3fit/backends/keras_backend/callbacks.py | 7 +++---- validphys2/src/validphys/core.py | 3 +++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index e5cc2a09c9..7f6ebb28aa 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -224,12 +224,11 @@ def __init__(self, pdf_model, replica_paths, stopping_object, check_freq=100): self.weight_dirs.append(weight_dir) def _save_weights(self, epoch, tr_weights, weight_dir): - - filepath = weight_dir / f"params_{epoch+1}.npz" + filepath = weight_dir / f"params_{epoch}.npz" # save parameters as expected by colibri 
trainable_weights_flat = np.concatenate([np.asarray(w).flatten() for w in tr_weights]) np.savez(filepath, params=trainable_weights_flat) - log.info(f"Saved parameters at epoch {epoch+1} in {filepath}") + log.info(f"Saved parameters at epoch {epoch} in {filepath}") def on_step_end(self, epoch, logs=None): """Function to be called at the end of every epoch @@ -240,7 +239,7 @@ def on_step_end(self, epoch, logs=None): pdf_replicas = self.pdf_model.split_replicas() for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs): weights = replica_model.trainable_weights - self._save_weights(epoch, weights, weight_dir) + self._save_weights(epoch + 1, weights, weight_dir) def on_train_end(self, logs=None): """Store the best parameters""" diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 4021a36559..5c4ae2fc23 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -886,6 +886,9 @@ def errorbar68(self): up = np.nanpercentile(self.error_members(), 84.13, axis=0) return down, up + def median(self): + return np.median(self.error_members(), axis=0) + def sample_values(self, size): return np.random.choice(self, size=size) From 020314836ec39ec2001a69ff0ad7c5504fc9b85d Mon Sep 17 00:00:00 2001 From: achiefa Date: Sat, 4 Apr 2026 17:48:01 +0100 Subject: [PATCH 15/16] Don't fail in storing weights if best fit is not found --- n3fit/src/n3fit/backends/keras_backend/callbacks.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 7f6ebb28aa..aa851ab036 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -244,9 +244,15 @@ def on_step_end(self, epoch, logs=None): def on_train_end(self, logs=None): """Store the best parameters""" for idx, weight_dir in enumerate(self.weight_dirs): - best_epoch = 
self.stopping_object._best_epochs[idx] - best_weights = self.stopping_object._best_weights[idx]['all_NNs'] - self._save_weights(best_epoch, best_weights, weight_dir) + weights = self.stopping_object._best_weights[idx] + if weights is not None: + best_weights = weights['all_NNs'] + best_epoch = self.stopping_object._best_epochs[idx] + self._save_weights(best_epoch, best_weights, weight_dir) + else: + log.warning( + f"No best weights found for replica {idx+1}, skipping saving best parameters." + ) def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0): From 674d3b26339cf8d079ad54a5cdf22c3995cb85c0 Mon Sep 17 00:00:00 2001 From: achiefa Date: Wed, 22 Apr 2026 17:12:35 +0100 Subject: [PATCH 16/16] Ensure that flavours is a list and not a numpy array --- validphys2/src/validphys/pdfgrids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/pdfgrids.py b/validphys2/src/validphys/pdfgrids.py index 039de9cc9a..247fdd8c8f 100644 --- a/validphys2/src/validphys/pdfgrids.py +++ b/validphys2/src/validphys/pdfgrids.py @@ -65,6 +65,10 @@ def __post_init__(self): if not isinstance(self.grid_values, Stats): raise ValueError("`XPlottingGrid` grid_values can only be instances of `Stats`") + # Ensure that flavours is a list or tuple and not numpy array + if isinstance(self.flavours, np.ndarray): + self.flavours = self.flavours.tolist() + def select_flavour(self, flindex): """Return a new grid for one single flavour""" if isinstance(flindex, str):