From 43aeb42e90b81a3bfd90b574fdcbfe7867fa0df7 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 4 Mar 2024 09:13:15 +0100 Subject: [PATCH 1/4] Rename kernel to multi_kernel --- n3fit/src/n3fit/backends/keras_backend/multi_dense.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py index b8caa9a3fd..d0c0bdaf4a 100644 --- a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py +++ b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py @@ -56,7 +56,7 @@ def __init__( def build(self, input_shape): input_dim = input_shape[-1] - self.kernel = self.add_weight( + self.multi_kernel = self.add_weight( name="kernel", shape=(self.replicas, input_dim, self.units), initializer=self.kernel_initializer, @@ -80,7 +80,7 @@ def build(self, input_shape): # TODO: benchmark against the replica-agnostic einsum below and make that default # see https://github.com/NNPDF/nnpdf/pull/1905#discussion_r1489344081 if self.replicas == 1: - matmul = lambda inputs: tf.tensordot(inputs, self.kernel[0], [[-1], [0]]) + matmul = lambda inputs: tf.tensordot(inputs, self.multi_kernel[0], [[-1], [0]]) if self.is_first_layer: # Manually add replica dimension self.matmul = lambda x: tf.expand_dims(matmul(x), axis=1) @@ -88,7 +88,7 @@ def build(self, input_shape): self.matmul = matmul else: einrule = "bnf,rfg->brng" if self.is_first_layer else "brnf,rfg->brng" - self.matmul = lambda inputs: tf.einsum(einrule, inputs, self.kernel) + self.matmul = lambda inputs: tf.einsum(einrule, inputs, self.multi_kernel) def call(self, inputs): """ From 2c9ce24d88999adb6f14197baf4243a0485ccacf Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 4 Mar 2024 11:21:24 +0100 Subject: [PATCH 2/4] list->tuple in output shape --- n3fit/src/n3fit/backends/keras_backend/multi_dense.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py 
b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py index d0c0bdaf4a..9fa0ac5835 100644 --- a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py +++ b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py @@ -125,7 +125,7 @@ def compute_output_shape(self, input_shape): output_shape = super().compute_output_shape(input_shape) # Add back the replica axis to the output shape. - output_shape = output_shape[:1] + [self.replicas] + output_shape[1:] + output_shape = output_shape[:1] + (self.replicas,) + output_shape[1:] return output_shape From baefa5fb6766f8431ab29e6c8fd0200ac5facb54 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 4 Mar 2024 12:03:58 +0100 Subject: [PATCH 3/4] miscellaneous fixes for tensorflow 2.16 kicking down the can recover previous behaviour try-except for 3.11 deal with type mismatch make sure units are int remove pdb fix change in how weights are named Update n3fit/src/n3fit/tests/test_hyperopt.py 0 is understood as None by initializer change scope of hyperopt test bugfix 312 --- .github/workflows/python_installation.yml | 2 +- .../n3fit/backends/keras_backend/MetaLayer.py | 15 +++------ .../n3fit/backends/keras_backend/MetaModel.py | 19 +++++++----- .../n3fit/backends/keras_backend/callbacks.py | 23 +++++++++----- .../backends/keras_backend/constraints.py | 10 ++---- .../backends/keras_backend/multi_dense.py | 4 +-- .../backends/keras_backend/operations.py | 14 ++++++--- .../n3fit/hyper_optimization/hyper_scan.py | 15 ++++++--- n3fit/src/n3fit/layers/observable.py | 4 +-- n3fit/src/n3fit/model_gen.py | 20 ++++++------ n3fit/src/n3fit/tests/test_hyperopt.py | 21 +++++++++---- n3fit/src/n3fit/tests/test_multidense.py | 31 +++++++++++-------- 12 files changed, 102 insertions(+), 76 deletions(-) diff --git a/.github/workflows/python_installation.yml b/.github/workflows/python_installation.yml index 1496c5f743..9116de7943 100644 --- a/.github/workflows/python_installation.yml +++ b/.github/workflows/python_installation.yml @@ -11,7 +11,7 @@
jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.11"] + python-version: ["3.11", "3.12"] include: - os: ubuntu-latest CONDA_OS: linux-64 diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py b/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py index af4210d603..31def842c4 100644 --- a/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py +++ b/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py @@ -9,12 +9,7 @@ """ from tensorflow.keras.layers import Layer -from tensorflow.keras.initializers import ( - Constant, - RandomUniform, - glorot_normal, - glorot_uniform, -) +from tensorflow.keras.initializers import Constant, RandomUniform, glorot_normal, glorot_uniform # Define in this dictionary new initializers as well as the arguments they accept (with default values if needed be) initializers = { @@ -37,9 +32,7 @@ class MetaLayer(Layer): weight_inits = [] # Building function - def builder_helper( - self, name, kernel_shape, initializer, trainable=True, constraint=None - ): + def builder_helper(self, name, kernel_shape, initializer, trainable=True, constraint=None): """ Creates a kernel that should be saved as an attribute of the caller class name: name of the kernel @@ -73,9 +66,9 @@ def get_weight_by_name(self, weight_name, internal_count=0): weight_name: str Name of the weight """ - check_name = f"{self.name}/{weight_name}:{internal_count}" + main_name = f"{self.name}/{weight_name}" for weight in self.weights: - if weight.name == check_name: + if weight.name in (f"{main_name}:{internal_count}", main_name, weight_name): return weight return None diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py index 2533d8456f..2956a25d2e 100644 --- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py +++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py @@ -6,8 +6,8 @@ """ import re +import shutil -import h5py import numpy as np import tensorflow as tf from tensorflow.keras import 
optimizers as Kopt @@ -16,12 +16,6 @@ import n3fit.backends.keras_backend.operations as op -# Check the TF version to check if legacy-mode is needed (TF < 2.2) -tf_version = tf.__version__.split(".") -if int(tf_version[0]) == 2 and int(tf_version[1]) < 2: - raise NotImplementedError("n3fit needs TF > 2.2 in order to work") - - # We need a function to transform tensors to numpy/python primitives # which is not part of the official TF interface and can change with the version if hasattr(tf_utils, "to_numpy_or_python_type"): @@ -414,6 +408,17 @@ def load_identical_replicas(self, model_file): for i_replica in range(self.num_replicas): self.set_replica_weights(weights, i_replica) + def save_weights(self, file, save_format="h5"): + """ + Compatibility function for tf < 2.16 + """ + try: + super().save_weights(file, save_format=save_format) + except TypeError: + new_file = file.with_suffix(".weights.h5") + super().save_weights(new_file) + shutil.move(new_file, file) + def is_stacked_single_replicas(layer): """ diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index 7349d6be36..c72ea9fb5c 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -10,9 +10,10 @@ import logging from time import time + import numpy as np import tensorflow as tf -from tensorflow.keras.callbacks import TensorBoard, Callback +from tensorflow.keras.callbacks import Callback, TensorBoard log = logging.getLogger(__name__) @@ -30,7 +31,7 @@ def __init__(self, count_range=100): self.last_time = 0 def on_epoch_end(self, epoch, logs=None): - """ At the end of every epoch it checks the time """ + """At the end of every epoch it checks the time""" new_time = time() if epoch == 0: # The first epoch is only useful for starting @@ -45,13 +46,13 @@ def on_epoch_end(self, epoch, logs=None): self.last_time = new_time def on_train_end(self, logs=None): - """ Print the results 
""" + """Print the results""" total_time = time() - self.starting_time n_times = len(self.all_times) # Skip the first 100 epochs to avoid fluctuations due to compilations of part of the code # by epoch 100 all parts of the code have usually been called so it's a good compromise - mean = np.mean(self.all_times[min(110, n_times-1):]) - std = np.std(self.all_times[min(110, n_times-1):]) + mean = np.mean(self.all_times[min(110, n_times - 1) :]) + std = np.std(self.all_times[min(110, n_times - 1) :]) log.info(f"> > Average time per epoch: {mean:.5} +- {std:.5} s") log.info(f"> > > Total time: {total_time/60:.5} min") @@ -75,9 +76,15 @@ def __init__(self, stopping_object, log_freq=100): super().__init__() self.log_freq = log_freq self.stopping_object = stopping_object + self._current_loss = None + + def on_epoch_begin(self, epoch, logs=None): + # TODO This is an unnecessary performance hit, just for testing + self._current_loss = self.model.compute_losses() def on_epoch_end(self, epoch, logs=None): - """ Function to be called at the end of every epoch """ + """Function to be called at the end of every epoch""" + logs = self._current_loss print_stats = ((epoch + 1) % self.log_freq) == 0 # Note that the input logs correspond to the fit before the weights are updated self.stopping_object.monitor_chi2(logs, epoch, print_stats=print_stats) @@ -117,7 +124,7 @@ def __init__(self, datasets, multipliers, update_freq=100): self.updateable_weights = [] def on_train_begin(self, logs=None): - """ Save an instance of all relevant layers """ + """Save an instance of all relevant layers""" for layer_name in self.datasets: layer = self.model.get_layer(layer_name) self.updateable_weights.append(layer.weights) @@ -133,7 +140,7 @@ def _update_weights(self): w.assign(w * multiplier) def on_epoch_end(self, epoch, logs=None): - """ Function to be called at the end of every epoch """ + """Function to be called at the end of every epoch""" if (epoch + 1) % self.update_freq == 0: 
self._update_weights() diff --git a/n3fit/src/n3fit/backends/keras_backend/constraints.py b/n3fit/src/n3fit/backends/keras_backend/constraints.py index b186cd2638..5b1bd8d413 100644 --- a/n3fit/src/n3fit/backends/keras_backend/constraints.py +++ b/n3fit/src/n3fit/backends/keras_backend/constraints.py @@ -3,8 +3,8 @@ """ import tensorflow as tf -from tensorflow.keras.constraints import MinMaxNorm from tensorflow.keras import backend as K +from tensorflow.keras.constraints import MinMaxNorm class MinMaxWeight(MinMaxNorm): @@ -14,15 +14,11 @@ class MinMaxWeight(MinMaxNorm): """ def __init__(self, min_value, max_value, **kwargs): - super(MinMaxWeight, self).__init__( - min_value=min_value, max_value=max_value, **kwargs - ) + super(MinMaxWeight, self).__init__(min_value=min_value, max_value=max_value, **kwargs) - @tf.function def __call__(self, w): norms = K.sum(w, axis=self.axis, keepdims=True) desired = ( - self.rate * K.clip(norms, self.min_value, self.max_value) - + (1 - self.rate) * norms + self.rate * K.clip(norms, self.min_value, self.max_value) + (1 - self.rate) * norms ) return w * desired / (K.epsilon() + norms) diff --git a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py index 9fa0ac5835..3b2037c47e 100644 --- a/n3fit/src/n3fit/backends/keras_backend/multi_dense.py +++ b/n3fit/src/n3fit/backends/keras_backend/multi_dense.py @@ -99,8 +99,8 @@ def call(self, inputs): If the input already contains multiple replica outputs, it is equivalent to applying each replica to its corresponding input. 
""" - if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype: - inputs = tf.cast(inputs, dtype=self._compute_dtype_object) + # cast always + inputs = tf.cast(inputs, dtype=self.compute_dtype) outputs = self.matmul(inputs) diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py index 12d16b0d73..c73a548b53 100644 --- a/n3fit/src/n3fit/backends/keras_backend/operations.py +++ b/n3fit/src/n3fit/backends/keras_backend/operations.py @@ -22,8 +22,10 @@ Note that tensor operations can also be applied to layers as the output of a layer is a tensor equally operations are automatically converted to layers when used as such. """ + from typing import Optional +import keras import numpy as np import numpy.typing as npt import tensorflow as tf @@ -249,11 +251,15 @@ def concatenate(tensor_list, axis=-1, target_shape=None, name=None): Concatenates a list of numbers or tensor into a bigger tensor If the target shape is given, the output is reshaped to said shape """ - concatenated_tensor = tf.concat(tensor_list, axis, name=name) - if target_shape: - return K.reshape(concatenated_tensor, target_shape) - else: + try: + # For tensorflow >= 2.16, Keras >= 3 + concatenated_tensor = keras.ops.concatenate(tensor_list, axis=axis) + except AttributeError: + concatenated_tensor = tf.concat(tensor_list, axis=axis) + + if target_shape is None: return concatenated_tensor + return K.reshape(concatenated_tensor, target_shape) def einsum(equation, *args, **kwargs): diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 174e921677..f06234ff6a 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -16,6 +16,7 @@ import logging import hyperopt +from hyperopt.pyll.base import scope import numpy as np from n3fit.backends import MetaLayer, MetaModel @@ -36,7 +37,7 @@ def hp_uniform(key, lower_end, 
higher_end): return hyperopt.hp.uniform(key, lower_end, higher_end) -def hp_quniform(key, lower_end, higher_end, step_size=None, steps=None): +def hp_quniform(key, lower_end, higher_end, step_size=None, steps=None, make_int=False): """Like uniform but admits a step_size""" if lower_end is None or higher_end is None: return None @@ -44,7 +45,11 @@ def hp_quniform(key, lower_end, higher_end, step_size=None, steps=None): step_size = lower_end if steps: step_size = (higher_end - lower_end) / steps - return hyperopt.hp.quniform(key, lower_end, higher_end, step_size) + + ret = hyperopt.hp.quniform(key, lower_end, higher_end, step_size) + if make_int: + ret = scope.int(ret) + return ret def hp_loguniform(key, lower_end, higher_end): @@ -276,7 +281,7 @@ def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_pati stopping_key = "stopping_patience" if min_epochs is not None and max_epochs is not None: - epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1) + epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1, make_int=True) self._update_param(epochs_key, epochs) if min_patience is not None or max_patience is not None: @@ -429,7 +434,9 @@ def architecture( units = [] for i in range(n): units_label = "nl{0}:-{1}/{0}".format(n, i) - units_sampler = hp_quniform(units_label, min_units, max_units, step_size=1) + units_sampler = hp_quniform( + units_label, min_units, max_units, step_size=1, make_int=True + ) units.append(units_sampler) # The number of nodes in the last layer are read from the runcard units.append(output_size) diff --git a/n3fit/src/n3fit/layers/observable.py b/n3fit/src/n3fit/layers/observable.py index 2a5dd9a93e..b1a2d701b3 100644 --- a/n3fit/src/n3fit/layers/observable.py +++ b/n3fit/src/n3fit/layers/observable.py @@ -81,9 +81,6 @@ def build(self, input_shape): super().build(input_shape) - def compute_output_shape(self, input_shape): - return (self.output_dim, None) - def call(self, pdf): """ This function 
perform the convolution with the fktable and one (DIS) or two (DY-like) pdfs. @@ -111,6 +108,7 @@ def call(self, pdf): observables = self.operation(observables) return observables + # Overridables @abstractmethod def gen_mask(self, basis): pass diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py index 219bbdfc11..8c4c30c59a 100644 --- a/n3fit/src/n3fit/model_gen.py +++ b/n3fit/src/n3fit/model_gen.py @@ -9,6 +9,7 @@ """ + from dataclasses import dataclass from typing import Callable, List @@ -73,7 +74,9 @@ def _generate_loss(self, mask=None): if self.invcovmat is not None: if self.rotation: # If we have a matrix diagonal only, padd with 0s and hope it's not too heavy on memory - invcovmat_matrix = np.eye(self.invcovmat.shape[-1]) * self.invcovmat[..., np.newaxis] + invcovmat_matrix = ( + np.eye(self.invcovmat.shape[-1]) * self.invcovmat[..., np.newaxis] + ) if self.covmat is not None: covmat_matrix = np.eye(self.covmat.shape[-1]) * self.covmat[..., np.newaxis] else: @@ -82,11 +85,7 @@ def _generate_loss(self, mask=None): covmat_matrix = self.covmat invcovmat_matrix = self.invcovmat loss = losses.LossInvcovmat( - invcovmat_matrix, - self.data, - mask, - covmat=covmat_matrix, - name=self.name + invcovmat_matrix, self.data, mask, covmat=covmat_matrix, name=self.name ) elif self.positivity: loss = losses.LossPositivity(name=self.name, c=self.multiplier) @@ -642,9 +641,10 @@ def compute_unnormalized_pdf(x): if photons: # add batch and flavor dimensions - photon_integrals = op.batchit(op.batchit(photons.integral)) + ph_tensor = op.numpy_to_tensor(photons.integral) + photon_integrals = op.batchit(op.batchit(ph_tensor)) else: - photon_integrals = np.zeros((1, num_replicas, 1)) + photon_integrals = op.numpy_to_tensor(np.zeros((1, num_replicas, 1))) PDFs_normalized = sumrule_layer( { @@ -737,7 +737,7 @@ def layer_generator(i_layer, nodes_out, activation): layer = base_layer_selector( layer_type, kernel_initializer=initializers, - units=nodes_out, + 
units=int(nodes_out), activation=activation, input_shape=(nodes_in,), basis_size=basis_size, @@ -755,7 +755,7 @@ def layer_generator(i_layer, nodes_out, activation): layer_type, replica_seeds=replica_seeds, kernel_initializer=MetaLayer.select_initializer(initializer_name, seed=i_layer), - units=nodes_out, + units=int(nodes_out), activation=activation, is_first_layer=(i_layer == 0), regularizer=reg, diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index cecc747452..f53377b083 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -32,13 +32,21 @@ def load_data(info_file): def test_restart_from_pickle(tmp_path): - """Ensure that our hyperopt restart works as expected""" + """Ensure that after a hyperopt restart, the testing continues + from the same point. + The test is set up so that it does one trial, then stops, then a second one + And then this is compared with two trials one after the other. + + The test checks that the starting point of the second trial is the same in both cases + """ # Prepare the run quickcard = f"hyper-{QUICKNAME}.yml" quickpath = REGRESSION_FOLDER / quickcard - # Set up some options - n_trials_stop = 2 - n_trials_total = 4 + + # Set the test up so that it does one trial, then stops, then does another one + # and then we do two + n_trials_stop = 1 + n_trials_total = 2 output_restart = tmp_path / f"run_{n_trials_stop}_trials_and_then_{n_trials_total}_trials" output_direct = tmp_path / f"run_{n_trials_total}_trials" @@ -46,7 +54,7 @@ def test_restart_from_pickle(tmp_path): shutil.copy(quickpath, tmp_path) # run some trials for the first time sp.run( - f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} " f"-o {output_restart}".split(), + f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} -o {output_restart}".split(), cwd=tmp_path, check=True, ) @@ -78,4 +86,5 @@ def test_restart_from_pickle(tmp_path): assert restart_json[i]['misc'] == 
direct_json[i]['misc'] assert restart_json[i]['state'] == direct_json[i]['state'] assert restart_json[i]['tid'] == direct_json[i]['tid'] - assert restart_json[i]['result'] == direct_json[i]['result'] + assert restart_json[i]['misc']['idxs'] == direct_json[i]['misc']['idxs'] + # Note that it doesn't check the final loss of the second trial diff --git a/n3fit/src/n3fit/tests/test_multidense.py b/n3fit/src/n3fit/tests/test_multidense.py index 6c7df89e3a..e2a8ddc412 100644 --- a/n3fit/src/n3fit/tests/test_multidense.py +++ b/n3fit/src/n3fit/tests/test_multidense.py @@ -15,20 +15,21 @@ def test_multidense(): units=8, replica_seeds=[42, 43], is_first_layer=True, - kernel_initializer=GlorotUniform(seed=0), + kernel_initializer=GlorotUniform(seed=5), ), MultiDense(units=4, replica_seeds=[52, 53], kernel_initializer=GlorotUniform(seed=100)), ] ) - single_models = [ - Sequential( - [ - Dense(units=8, kernel_initializer=GlorotUniform(seed=42 + r)), - Dense(units=4, kernel_initializer=GlorotUniform(seed=52 + r + 100)), - ] + single_models = [] + for r in range(replicas): + single_models.append( + Sequential( + [ + Dense(units=8, kernel_initializer=GlorotUniform(seed=42 + r + 5)), + Dense(units=4, kernel_initializer=GlorotUniform(seed=52 + r + 100)), + ] + ) ) - for r in range(replicas) - ] gridsize, features = 100, 3 multi_dense_model.build(input_shape=(None, gridsize, features)) @@ -46,12 +47,17 @@ def test_multidense(): def test_initializers(): input_shape = (None, 3, 1) - dense_layers = [] + dense_weights = [] for r in range(2): dense_layer = Dense(units=2, kernel_initializer=GlorotUniform(seed=42 + r)) dense_layer.build(input_shape=input_shape) - dense_layers.append(dense_layer) - stacked_weights = tf.stack([dense_layer.weights[0] for dense_layer in dense_layers], axis=0) + try: + dense_weights.append(dense_layer.weights[0].value.numpy()) + except AttributeError: + # In tensorflow < 2.16, value was a function + dense_weights.append(dense_layer.weights[0].value().numpy()) + 
+ stacked_weights = np.stack(dense_weights, axis=0) multi_dense_layer = MultiDense( units=2, @@ -62,6 +68,5 @@ def test_initializers(): multi_dense_layer.build(input_shape=input_shape) multi_dense_weights = multi_dense_layer.weights[0].numpy() - stacked_weights = stacked_weights.numpy() np.testing.assert_allclose(multi_dense_weights, stacked_weights) From 9e6dd2cc780eb93548cd805a563c10ea0acc90ef Mon Sep 17 00:00:00 2001 From: juacrumar Date: Tue, 5 Mar 2024 15:43:53 +0100 Subject: [PATCH 4/4] change the per-100-epochs monitoring of chi2 to avoid having to recompute losses --- .../n3fit/backends/keras_backend/MetaModel.py | 10 ++- .../n3fit/backends/keras_backend/callbacks.py | 12 ++-- .../backends/keras_backend/operations.py | 14 ++-- n3fit/src/n3fit/stopping.py | 72 +++++++++---------- 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py index 2956a25d2e..0169019d18 100644 --- a/n3fit/src/n3fit/backends/keras_backend/MetaModel.py +++ b/n3fit/src/n3fit/backends/keras_backend/MetaModel.py @@ -410,12 +410,18 @@ def load_identical_replicas(self, model_file): def save_weights(self, file, save_format="h5"): """ - Compatibility function for tf < 2.16 + Compatibility function for: + - tf < 2.16, keras < 3: argument save format needed for h5 + - tf >= 2.16, keras >= 3: save format is deduced from the file extension + In both cases, the final weights are finally copied to the ``file`` path. 
""" try: + # Keras 2, tf < 2.16 super().save_weights(file, save_format=save_format) except TypeError: - new_file = file.with_suffix(".weights.h5") + # Newer versions of keras (>=3) drop the ``save_format`` argument + # and instead take the format from the extension of the file + new_file = file.with_suffix(f".weights.{save_format}") super().save_weights(new_file) shutil.move(new_file, file) diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py index c72ea9fb5c..05087dd640 100644 --- a/n3fit/src/n3fit/backends/keras_backend/callbacks.py +++ b/n3fit/src/n3fit/backends/keras_backend/callbacks.py @@ -76,15 +76,13 @@ def __init__(self, stopping_object, log_freq=100): super().__init__() self.log_freq = log_freq self.stopping_object = stopping_object - self._current_loss = None - - def on_epoch_begin(self, epoch, logs=None): - # TODO This is an unnecessary performance hit, just for testing - self._current_loss = self.model.compute_losses() def on_epoch_end(self, epoch, logs=None): - """Function to be called at the end of every epoch""" - logs = self._current_loss + """Function to be called at the end of every epoch + Every ``log_freq`` number of epochs, the ``monitor_chi2`` method of the ``stopping_object`` + will be called and the validation loss (broken down by experiment) will be logged. + For the training model only the total loss is logged during the training. 
+ """ print_stats = ((epoch + 1) % self.log_freq) == 0 # Note that the input logs correspond to the fit before the weights are updated self.stopping_object.monitor_chi2(logs, epoch, print_stats=print_stats) diff --git a/n3fit/src/n3fit/backends/keras_backend/operations.py b/n3fit/src/n3fit/backends/keras_backend/operations.py index c73a548b53..844274379c 100644 --- a/n3fit/src/n3fit/backends/keras_backend/operations.py +++ b/n3fit/src/n3fit/backends/keras_backend/operations.py @@ -37,6 +37,14 @@ from validphys.convolution import OP +# Select a concatenate function depending on the tensorflow version +try: + # For tensorflow >= 2.16, Keras >= 3 + concatenate_function = keras.ops.concatenate +except AttributeError: + # keras.ops was introduced in keras 3 + concatenate_function = tf.concat + def evaluate(tensor): """Evaluate input tensor using the backend""" @@ -251,11 +259,7 @@ def concatenate(tensor_list, axis=-1, target_shape=None, name=None): Concatenates a list of numbers or tensor into a bigger tensor If the target shape is given, the output is reshaped to said shape """ - try: - # For tensorflow >= 2.16, Keras >= 3 - concatenated_tensor = keras.ops.concatenate(tensor_list, axis=axis) - except AttributeError: - concatenated_tensor = tf.concat(tensor_list, axis=axis) + concatenated_tensor = concatenate_function(tensor_list, axis=axis) if target_shape is None: return concatenated_tensor diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 380c5445d7..dec9236b55 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -146,13 +146,16 @@ class FitState: all losses for the training model validation_info: dict all losses for the validation model + training_loss: float + total training loss, this can be given if per-exp``training_info`` + is not available """ vl_ndata = None tr_ndata = None vl_suffix = None - def __init__(self, training_info, validation_info): + def __init__(self, training_info, validation_info, 
training_loss=None): if self.vl_ndata is None or self.tr_ndata is None or self.vl_suffix is None: raise ValueError( "FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled" @@ -164,6 +167,8 @@ def __init__(self, training_info, validation_info): self._tr_chi2 = None # This is an overall training chi2 self._vl_dict = None self._tr_dict = None + # This can be given if ``training_info`` is not given + self._training_loss = training_loss @property def vl_loss(self): @@ -173,6 +178,8 @@ def vl_loss(self): @property def tr_loss(self): """Return the total validation loss as it comes from the info dictionaries""" + if self._training is None: + return self._training_loss return self._training.get("loss") def _parse_chi2(self): @@ -223,7 +230,7 @@ def total_partial_tr_chi2(self): def total_partial_vl_chi2(self): """Return the vl chi2 summed over replicas per experiment""" - return {k: np.sum(v) for k, v in self.all_tr_chi2.items()} + return {k: np.sum(v) for k, v in self.all_vl_chi2.items()} def total_tr_chi2(self): """Return the total tr chi2 summed over replicas""" @@ -273,27 +280,12 @@ def get_state(self, epoch): f"Tried to get obtain the state for epoch {epoch} when only {len(self._history)} epochs have been saved" ) from e - def register(self, epoch, training_info, validation_info): - """Save a new fitstate and updates the current final epoch - - Parameters - ---------- - epoch: int - the current epoch of the fit - training_info: dict - all losses for the training model - validation_info: dict - all losses for the validation model - - Returns - ------- - FitState + def register(self, epoch, fitstate): + """Save the current fitstate and the associated epoch + and set the current epoch as the final one should the fit end now """ - # Save all the information in a fitstate object - fitstate = FitState(training_info, validation_info) self.final_epoch = epoch self._history.append(fitstate) - return fitstate class Stopping: @@ -425,8 +417,8 @@ def 
monitor_chi2(self, training_info, epoch, print_stats=False): Parameters ---------- training_info: dict - output of a .fit() call, dictionary of the total loss (summed over replicas) for - each experiment + output of a .fit() call, dictionary of the total training loss + (summed over replicas and experiments) epoch: int index of the epoch @@ -436,7 +428,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): true/false according to the status of the run """ # Step 1. Check whether the fit has NaN'd and stop it if so - if np.isnan(training_info["loss"]): + if np.isnan(training_loss := training_info["loss"]): log.warning(" > NaN found, stopping activated") self.make_stop() return False @@ -445,7 +437,9 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): validation_info = self._validation.compute_losses() # Step 3. Register the current point in (the) history - fitstate = self._history.register(epoch, training_info, validation_info) + # and set the current final epoch as the current one + fitstate = FitState(None, validation_info, training_loss) + self._history.register(epoch, fitstate) if print_stats: self.print_current_stats(epoch, fitstate) @@ -496,21 +490,23 @@ def _restore_best_weights(self): def print_current_stats(self, epoch, fitstate): """ - Prints ``fitstate`` training and validation chi2s + Prints ``fitstate`` validation chi2 for every experiment + and the current total training loss as well as the validation loss + after the training step """ epoch_index = epoch + 1 - tr_chi2 = fitstate.total_tr_chi2() vl_chi2 = fitstate.total_vl_chi2() - total_str = f"At epoch {epoch_index}/{self.total_epochs}, total chi2: {tr_chi2}\n" + total_str = f"""Epoch {epoch_index}/{self.total_epochs}: loss: {fitstate.tr_loss:.7f} +Validation loss after training step: {vl_chi2:.7f}. 
+Validation chi2s: """ # The partial chi2 makes no sense for more than one replica at once: if self._n_replicas == 1: - partial_tr_chi2 = fitstate.total_partial_tr_chi2() + partial_vl_chi2 = fitstate.total_partial_vl_chi2() partials = [] - for experiment, chi2 in partial_tr_chi2.items(): + for experiment, chi2 in partial_vl_chi2.items(): partials.append(f"{experiment}: {chi2:.3f}") - total_str += ", ".join(partials) + "\n" - total_str += f"Validation chi2 at this point: {vl_chi2}" + total_str += ", ".join(partials) log.info(total_str) def stop_here(self): @@ -525,6 +521,7 @@ def stop_here(self): def chi2exps_json(self, i_replica=0, log_each=100): """ Returns and apt-for-json dictionary with the status of the fit every `log_each` epochs + It reports the total training loss and the validation loss broken down by experiment. Parameters ---------- @@ -543,16 +540,14 @@ def chi2exps_json(self, i_replica=0, log_each=100): for epoch in range(log_each - 1, final_epoch + 1, log_each): fitstate = self._history.get_state(epoch) - all_tr = fitstate.all_tr_chi2_for_replica(i_replica) - all_vl = fitstate.all_vl_chi2_for_replica(i_replica) + # Get the training and validation losses + tmp = {"training_loss": fitstate.tr_loss, "validation_loss": fitstate.vl_loss.tolist()} - tmp = {exp: {"training": tr_chi2} for exp, tr_chi2 in all_tr.items()} - for exp, vl_chi2 in all_vl.items(): - if exp not in tmp: - tmp[exp] = {"training": None} - tmp[exp]["validation"] = vl_chi2 + # And the validation chi2 broken down by experiment + tmp["validation_chi2s"] = fitstate.all_vl_chi2_for_replica(i_replica) json_dict[epoch + 1] = tmp + return json_dict @@ -586,6 +581,7 @@ def check_positivity(self, history_object): otherwise, it passes. It returns an array booleans which are True if positivity passed story_object[key_loss] < self.threshold + Parameters ---------- history_object: dict