When launching many-replica parallel fits on the trvl-mask-layers branch on the CPU, a couple of things seem to be broken. With 10 replicas, everything runs fine. With 50 replicas, the following error occurs at the training stage:
[WARNING]: > NaN found, stopping activated
[CRITICAL]: Bug in n3fit ocurred. Please report it.
Traceback (most recent call last):
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/scripts/n3fit_exec.py", line 286, in run
super().run()
File "/gpfs/home6/gijstest/src/nnpdf/validphys2/src/validphys/app.py", line 152, in run
super().run()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/app.py", line 380, in run
rb.execute_sequential()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 166, in execute_sequential
result = self.get_result(callspec.function,
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 175, in get_result
fres = function(**kwdict)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/performfit.py", line 266, in performfit
log.info("Stopped at epoch=%d", stopping_object.stop_epoch)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/stopping.py", line 387, in stop_epoch
return self._history.final_epoch + 1
TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'
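This seems to be a side effect of the stopping refactoring: the fit is stopped by the NaN check, `final_epoch` is apparently still `None` at that point, and so `stop_epoch` fails when it tries to add 1 to it.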
Surprisingly, when running 100 parallel replicas, the training step fails with a different error:
[CRITICAL]: Bug in n3fit ocurred. Please report it.
Traceback (most recent call last):
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/scripts/n3fit_exec.py", line 286, in run
super().run()
File "/gpfs/home6/gijstest/src/nnpdf/validphys2/src/validphys/app.py", line 152, in run
super().run()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/app.py", line 380, in run
rb.execute_sequential()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 166, in execute_sequential
result = self.get_result(callspec.function,
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 175, in get_result
fres = function(**kwdict)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/performfit.py", line 262, in performfit
result = pdf_gen_and_train_function(parameters)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/model_trainer.py", line 942, in hyperparametrizable
passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/model_trainer.py", line 743, in _train_and_fit
training_model.perform_fit(
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/backends/keras_backend/MetaModel.py", line 170, in perform_fit
history = super().fit(x=x_params, y=y, epochs=epochs, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:
Detected at node 'meta_model/trmask_BCDMSD_dw_ite/boolean_mask/GatherV2' defined at (most recent call last):
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/bin/n3fit", line 33, in <module>
sys.exit(load_entry_point('n3fit', 'console_scripts', 'n3fit')())
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/scripts/n3fit_exec.py", line 298, in main
a.main()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/app.py", line 395, in main
self.run()
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/scripts/n3fit_exec.py", line 286, in run
super().run()
File "/gpfs/home6/gijstest/src/nnpdf/validphys2/src/validphys/app.py", line 152, in run
super().run()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/app.py", line 380, in run
rb.execute_sequential()
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 166, in execute_sequential
result = self.get_result(callspec.function,
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/reportengine/resourcebuilder.py", line 175, in get_result
fres = function(**kwdict)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/performfit.py", line 262, in performfit
result = pdf_gen_and_train_function(parameters)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/model_trainer.py", line 942, in hyperparametrizable
passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/model_trainer.py", line 743, in _train_and_fit
training_model.perform_fit(
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/backends/keras_backend/MetaModel.py", line 170, in perform_fit
history = super().fit(x=x_params, y=y, epochs=epochs, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/training.py", line 1384, in fit
tmp_logs = self.train_function(iterator)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function
return step_function(self, iterator)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step
outputs = model.train_step(data)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/training.py", line 859, in train_step
y_pred = self(x, training=True)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1096, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/functional.py", line 451, in call
return self._run_internal_graph(
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/functional.py", line 589, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1096, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/home/gijstest/.conda/envs/nnpdf-dev-cpu/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/layers/mask.py", line 49, in call
if self.mask is not None:
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/layers/mask.py", line 50, in call
flat_res = op.boolean_mask(ret, self.mask, axis=self.axis)
File "/gpfs/home6/gijstest/src/nnpdf/n3fit/src/n3fit/backends/keras_backend/operations.py", line 225, in boolean_mask
return tf.boolean_mask(*args, **kwargs)
Node: 'meta_model/trmask_BCDMSD_dw_ite/boolean_mask/GatherV2'
indices[88] = -4936617473272247041 is not in [0, 24800)
[[{{node meta_model/trmask_BCDMSD_dw_ite/boolean_mask/GatherV2}}]] [Op:__inference_train_function_323127]
When launching many-replica parallel fits on the trvl-mask-layers branch on the CPU, a couple of things seem to be broken. With 10 replicas, everything runs fine. With 50 replicas, the following error occurs at the training stage:
which seems like a side effect of the stopping refactoring.
Surprisingly, when running 100 parallel replicas, the training step fails with a different error:
Apparently the mask layer fails for the `BCDMSD_dw_ite` dataset, but I have seen the same issue occur for other datasets too.