diff --git a/.github/workflows/all_tests_nnpdf.yml b/.github/workflows/all_tests_nnpdf.yml index f37bb5640d..f61f4764c1 100644 --- a/.github/workflows/all_tests_nnpdf.yml +++ b/.github/workflows/all_tests_nnpdf.yml @@ -110,6 +110,7 @@ jobs: shell: bash -l {0} run: | cd n3fit/runcards/examples + vp-setupfit Basic_runcard.yml n3fit Basic_runcard.yml 4 cat Basic_runcard/nnfit/*/Basic_runcard.json - name: Test we can still run postfit @@ -139,6 +140,7 @@ jobs: shell: bash -l {0} run: | cd n3fit/runcards/examples + vp-setupfit Basic_runcard.yml n3fit Basic_runcard.yml 42 cat Basic_runcard/nnfit/*/Basic_runcard.json diff --git a/doc/sphinx/source/tutorials/closuretest.rst b/doc/sphinx/source/tutorials/closuretest.rst index e5dbe8649d..097f7afcb5 100644 --- a/doc/sphinx/source/tutorials/closuretest.rst +++ b/doc/sphinx/source/tutorials/closuretest.rst @@ -159,11 +159,11 @@ Running a closure test with ``n3fit`` Running a closure test with ``n3fit`` will require a valid ``n3fit`` runcard, with the closure test settings modified as shown -:ref:`above `. The difference -between running a closure fit in ``n3fit`` and a standard fit is that the user is -required to run ``vp-setupfit`` on the runcard before running ``n3fit``. This is -because the filtering of the data is required to generate the pseudodata central -values. The workflow is as follows: +:ref:`above `. As with any standard fit, the user is required +to run ``vp-setupfit`` on the runcard before running ``n3fit`` (see +:ref:`run-n3fit-fit`); for a closure test this step is additionally responsible +for filtering the data and generating the pseudodata central values. The +workflow is as follows: .. code:: bash diff --git a/doc/sphinx/source/tutorials/run-fit.rst b/doc/sphinx/source/tutorials/run-fit.rst index 7f11965739..f372efdd78 100644 --- a/doc/sphinx/source/tutorials/run-fit.rst +++ b/doc/sphinx/source/tutorials/run-fit.rst @@ -75,10 +75,13 @@ Running the fitting code After successfully installing the ``n3fit`` package and preparing a runcard following the points presented above you can proceed with a fit. -1. Prepare the fit using ``vp-setupfit``. This command will generate a - folder with the same name as the runcard (minus the file extension) in the - current directory, which will contain a copy of the original YAML runcard. - The required resources will be downloaded, which includes: +1. Prepare the fit using ``vp-setupfit``. **This step is mandatory**: it + produces resources that ``n3fit`` requires in order to run. + + The command creates a folder with the same name as the runcard (minus the + file extension) in the current directory, which will contain a copy of the + original YAML runcard. The required resources will be downloaded, which + includes: - The t0 PDF set (an LHAPDF object). - The FastKernel tables in the form of a ``theory_xxx.tgz`` file @@ -87,6 +90,10 @@ following the points presented above you can proceed with a fit. If the runcard requires to precompute some heavy objects shared among replicas, such as the theory covariance matrix, it will be done during this step. + A file named ``md5`` is also written inside the output folder. It contains a + hash of the runcard which ``n3fit`` uses to detect changes made to the + runcard between ``vp-setupfit`` and ``n3fit`` (see step 2). + :: vp-setupfit .yml @@ -97,6 +104,27 @@ following the points presented above you can proceed with a fit. replica fit you should launch more than 100 replicas (e.g. 120) since not all of them will necessarily converge. While by default the code runs each replica separately, it is possible to run many replicas in parallel, see :ref:`parallel-label`. + Before training begins, ``n3fit`` recomputes the md5 hash of the runcard + and compares it against the value written by ``vp-setupfit`` in step 1. + If the two do not match — i.e. the runcard was modified after + ``vp-setupfit`` was run — ``n3fit`` aborts with an error. The fix is to + re-run ``vp-setupfit`` so that the md5 (and any heavy resources affected + by the change, such as the covariance matrix) are regenerated from the + current runcard. + + .. warning:: + + The md5 check can be bypassed by passing ``--skip-md5-check`` to + ``n3fit``:: + + n3fit .yml --skip-md5-check + + Use this flag at your own risk: it disables the only safeguard + against running ``n3fit`` on a runcard that has diverged from the + one ``vp-setupfit`` was given. Resources cached during ``vp-setupfit`` + (covariance matrices, t0 PDF, theory tables) may no longer be + consistent with the current runcard. + :: for i in {1..120} ; do diff --git a/extra_tests/regression_checks.py b/extra_tests/regression_checks.py index 22b4e97bf0..b3f038ff41 100644 --- a/extra_tests/regression_checks.py +++ b/extra_tests/regression_checks.py @@ -8,6 +8,7 @@ import pytest +from n3fit.tests.helpers import run_n3fit from n3fit.tests.test_fit import EXE, check_fit_results from validphys.utils import yaml_safe @@ -44,7 +45,7 @@ def test_regression_fit(tmp_path, runcard, replica, regenerate): if (wname := runcard_info.get("load")) is not None: shutil.copy(REGRESSION_FOLDER / wname, tmp_path) - sp.run(f"{EXE} {runcard_name} {replica}".split(), cwd=tmp_path, check=True) + run_n3fit(runcard_name, f"{replica}", cwd=tmp_path, check=True) old_json_file = REGRESSION_FOLDER / f"{runcard}_{replica}.json" check_fit_results( diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index c77fbbeae0..d8b6c408ac 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -13,6 +13,7 @@ import pandas as pd from ruamel.yaml import error +from n3fit.scripts.vp_setupfit import MD5_FILENAME, SetupFitError, _compute_fit_md5 from reportengine import colors from reportengine.namespaces import NSList from validphys.api import API @@ -71,6 +72,11 @@ def init_output(self): # check if results folder exists self.output_path = pathlib.Path(self.output_path).absolute() + + # Verify vp-setupfit hash is consistent with the current fit configuration + # before we touch anything in the output folder. + self._verify_setupfit_md5() + if not (self.output_path / "nnfit").is_dir(): if not re.fullmatch(r"[\w.\-]+", self.output_path.name): raise N3FitError("Invalid output folder name. Must be alphanumeric.") @@ -103,6 +109,29 @@ def init_output(self): self.input_folder = self.replica_path / INPUT_FOLDER self.input_folder.mkdir(exist_ok=True) + def _verify_setupfit_md5(self): + if self.skip_md5_check: + log.warning("Skipping md5 check against vp-setupfit (--skip-md5-check is set).") + return + + md5_path = self.output_path / MD5_FILENAME + if not md5_path.exists(): + raise N3FitError( + f"{MD5_FILENAME} file not found at {md5_path}. " + "Run vp-setupfit on this runcard before n3fit." + ) + stored = md5_path.read_text().strip() + try: + current = _compute_fit_md5(self.config_yml, self.output_path) + except SetupFitError as e: + raise N3FitError(f"Failed to compute current fit md5: {e}") from e + if stored != current: + raise N3FitError( + f"md5 mismatch in {self.output_path}: stored {stored} != current {current}. " + "The runcard changed since vp-setupfit was run. Re-run vp-setupfit (or pass " + "--skip-md5-check to override this check if this is the desired behaviour)." + ) + @classmethod def ns_dump_description(cls): return { @@ -312,6 +341,12 @@ def check_positive(value): help="End of the range of replicas to compute", type=check_positive, ) + parser.add_argument( + "--skip-md5-check", + help="Skip the integrity check against the md5 written by vp-setupfit.", + action="store_true", + default=False, + ) return parser def get_commandline_arguments(self, cmdline=None): @@ -341,6 +376,7 @@ def run(self): self.environment.db_host = self.args["db_host"] self.environment.db_port = self.args["db_port"] self.environment.db_name = self.args["db_name"] + self.environment.skip_md5_check = self.args["skip_md5_check"] super().run() except N3FitError as e: log.error(f"Error in n3fit:\n{e}") diff --git a/n3fit/src/n3fit/scripts/vp_setupfit.py b/n3fit/src/n3fit/scripts/vp_setupfit.py index 91fa2e7001..8ed0d8766a 100644 --- a/n3fit/src/n3fit/scripts/vp_setupfit.py +++ b/n3fit/src/n3fit/scripts/vp_setupfit.py @@ -68,6 +68,17 @@ INPUT_FOLDER = "input" +def _compute_fit_md5(config_yaml: pathlib.Path, output_path: pathlib.Path) -> str: + """Compute the md5 hash of the fit when vp-setupfit is run. This utility function + is also used in n3fit to check that the fit configuration has not changed + since the setup-fit was run.""" + hash_md5 = hashlib.md5() + with open(config_yaml, 'rb') as f: + hash_md5.update(f.read()) + digest = hash_md5.hexdigest() + return digest + + class SetupFitError(Exception): """Exception raised when setup-fit cannot succeed and knows why""" @@ -113,13 +124,12 @@ def init_output(self): self.input_folder.mkdir(exist_ok=True) def save_md5(self): - """Generate md5 key from file""" + """Generate md5 key from the runcard and the stored fitting covmat table.""" output_filename = self.output_path / MD5_FILENAME - with open(self.config_yml, 'rb') as f: - hash_md5 = hashlib.md5(f.read()).hexdigest() + digest = _compute_fit_md5(self.config_yml, self.output_path) with open(output_filename, 'w') as g: - g.write(hash_md5) - log.info(f"md5 {hash_md5} stored in {output_filename}") + g.write(digest) + log.info(f"md5 {digest} stored in {output_filename}") @classmethod def ns_dump_description(cls): diff --git a/n3fit/src/n3fit/tests/helpers.py b/n3fit/src/n3fit/tests/helpers.py new file mode 100644 index 0000000000..c2cb37c826 --- /dev/null +++ b/n3fit/src/n3fit/tests/helpers.py @@ -0,0 +1,44 @@ +""" +Test helpers for the n3fit regression suite. + +n3fit verifies an md5 written by vp-setupfit (see +``N3FitEnvironment._verify_setupfit_md5``), so any test that invokes n3fit +on a runcard must run vp-setupfit first. ``run_n3fit`` wraps that pair so +individual tests don't have to. +""" + +import subprocess as sp + + +def run_setupfit(runcard, *, cwd, output=None): + """Run ``vp-setupfit [-o ]`` before ``n3fit``. + + Exposed separately so tests that launch n3fit via ``sp.Popen`` (e.g. the + parallel-hyperopt tests) can run the setupfit step once before forking + workers. + """ + cmd = ["vp-setupfit", str(runcard)] + if output is not None: + cmd += ["-o", str(output)] + return sp.run(cmd, cwd=cwd, check=True) + + +def run_n3fit(runcard, args="", *, cwd, setupfit=True, **run_kwargs): + """Run ``vp-setupfit`` and then ``n3fit`` on the same runcard. + + If ``-o `` appears in ``args`` it's forwarded to vp-setupfit so + the md5 lands in the directory n3fit will read it from. + + Set ``setupfit=False`` when this call shares both the runcard *and* the + output directory (default or ``-o``) with an earlier call that already + ran vp-setupfit — the md5 written by that call is still valid. + """ + args_list = args.split() if isinstance(args, str) else list(args) + + if setupfit: + output = None + if "-o" in args_list: + output = args_list[args_list.index("-o") + 1] + run_setupfit(runcard, cwd=cwd, output=output) + + return sp.run(["n3fit", str(runcard), *args_list], cwd=cwd, **run_kwargs) diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py index f844bad3a6..c7f10e3faf 100644 --- a/n3fit/src/n3fit/tests/test_fit.py +++ b/n3fit/src/n3fit/tests/test_fit.py @@ -24,6 +24,7 @@ import n3fit from n3fit.io.writer import SuperEncoder +from n3fit.tests.helpers import run_n3fit, run_setupfit from validphys.n3fit_data import replica_mcseed, replica_nnseed, replica_trvlseed from validphys.utils import yaml_safe @@ -160,7 +161,7 @@ def _auxiliary_performfit(tmp_path, runcard=QUICKNAME, replica=1, timing=True, r shutil.copy(quickpath, tmp_path) shutil.copy(weightpath, tmp_path / f"{weight_name}.weights.h5") # run the fit - sp.run(f"{EXE} {quickcard} {replica}".split(), cwd=tmp_path, check=True) + run_n3fit(quickcard, str(replica), cwd=tmp_path, check=True) # And compare check_fit_results(tmp_path, runcard, replica, old_json_file, timing=timing, rel_error=rel_error) @@ -185,7 +186,7 @@ def test_performfit_with_old_theory(tmp_path): quickcard = "quickcard_old.yml" quickpath = REGRESSION_FOLDER / quickcard shutil.copy(quickpath, tmp_path) - sp.run(f"{EXE} {quickcard} 5".split(), cwd=tmp_path, check=True) + run_n3fit(quickcard, "5", cwd=tmp_path, check=True) @pytest.mark.skip(reason="Still not implemented in parallel mode") @@ -198,7 +199,7 @@ def test_hyperopt(tmp_path): # We just want to ensure that the hyperopt can run, but we need to kill it ourselves # 60 seconds should be enough with pytest.raises(sp.TimeoutExpired): - sp.run(f"{EXE} {quickcard} {REPLICA} --hyperopt 1000".split(), cwd=tmp_path, timeout=60) + run_n3fit(quickcard, f"{REPLICA} --hyperopt 1000", cwd=tmp_path, timeout=60) def test_novalidation(tmp_path, timing=30): @@ -209,7 +210,7 @@ def test_novalidation(tmp_path, timing=30): quickpath = REGRESSION_FOLDER / quickcard shutil.copy(quickpath, tmp_path) with pytest.raises(sp.TimeoutExpired): - sp.run(f"{EXE} {quickcard} {REPLICA}".split(), cwd=tmp_path, timeout=timing) + run_n3fit(quickcard, REPLICA, cwd=tmp_path, timeout=timing) def test_weirdbasis(tmp_path, timing=30): @@ -225,7 +226,7 @@ def test_weirdbasis(tmp_path, timing=30): shutil.copy(quickpath, tmp_path) # with pytest.raises(sp.TimeoutExpired): with pytest.raises(sp.CalledProcessError): - sp.run(f"{EXE} {quickcard} {REPLICA}".split(), cwd=tmp_path, timeout=timing, check=True) + run_n3fit(quickcard, REPLICA, cwd=tmp_path, timeout=timing, check=True) @pytest.mark.linux @@ -239,7 +240,7 @@ def test_multireplica_runs(tmp_path, runcard): path = tmp_path / name path.mkdir() shutil.copy(quickpath, path) - sp.run(f"{EXE} {quickcard} {replicas}".split(), cwd=path, check=True) + run_n3fit(quickcard, replicas, cwd=path, check=True) for name_1, option_1 in options.items(): for name_2, option_2 in options.items(): @@ -299,8 +300,8 @@ def test_parallel_against_sequential(tmp_path, rep_from=6, rep_to=8): # Now run both for r in range(rep_from, rep_to + 1): - sp.run(f"{EXE} {card_sequenti} {r}".split(), cwd=tmp_path, check=True) - sp.run(f"{EXE} {card_parallel} {rep_from} -r {rep_to}".split(), cwd=tmp_path, check=True) + run_n3fit(card_sequenti, str(r), cwd=tmp_path, setupfit=(r == rep_from), check=True) + run_n3fit(card_parallel, f"{rep_from} -r {rep_to}", cwd=tmp_path, check=True) # Loop over all pseudodata files for both fits and load them up folder_seq = card_sequenti.with_suffix("") / "nnfit" @@ -335,3 +336,19 @@ def compare_weights(option_1, option_2, file_1, file_2): weight_name = file_1[key].name err_msg = f"Difference between runs `n3fit {option_1}` and `n3fit {option_2}` in weights {weight_name}" assert_allclose(file_1[key][:], file_2[key][:], rtol=1e-5, atol=1e-5, err_msg=err_msg) + + +def test_md5_mismatch_is_detected(tmp_path): + """vp-setupfit, then tamper with the runcard -> n3fit must refuse to start.""" + quickcard = f"{QUICKNAME}.yml" + weight_name = "weights_pol" if "_pol" in quickcard else "weights" + weightpath = REGRESSION_FOLDER / f"{weight_name}_1.weights.h5" + shutil.copy(REGRESSION_FOLDER / quickcard, tmp_path) + shutil.copy(weightpath, tmp_path / f"{weight_name}.weights.h5") + run_setupfit(quickcard, cwd=tmp_path) + + runcard_path = tmp_path / quickcard + runcard_path.write_text(runcard_path.read_text() + "\n# tampered\n") + + with pytest.raises(sp.CalledProcessError): + run_n3fit(quickcard, REPLICA, cwd=tmp_path, setupfit=False, check=True) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index bbae97ddd4..ad4a83f595 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -15,6 +15,7 @@ from n3fit.hyper_optimization.rewards import HyperLoss from n3fit.model_gen import ReplicaSettings, generate_pdf_model +from n3fit.tests.helpers import run_n3fit, run_setupfit from n3fit.vpinterface import N3PDF from validphys.loader import Loader from validphys.tests.conftest import THEORYID @@ -139,20 +140,22 @@ def test_restart_from_pickle(tmp_path): # cp runcard to tmp folder shutil.copy(quickpath, tmp_path) # run some trials for the first time - sp.run( - f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} -o {output_restart}".split(), + run_n3fit( + quickpath, + f"{REPLICA} --hyperopt {n_trials_stop} -o {output_restart}", cwd=tmp_path, check=True, ) - # restart the hyperopt and calculate more trials - sp.run( - f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} -o {output_restart}".split(), + run_n3fit( + quickpath, + f"{REPLICA} --hyperopt {n_trials_total} -o {output_restart}", cwd=tmp_path, + setupfit=False, check=True, - ) - # start again and calculate all trials at once - sp.run( - f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} -o {output_direct}".split(), + ) # md5 already in output_restart + run_n3fit( + quickpath, + f"{REPLICA} --hyperopt {n_trials_total} -o {output_direct}", cwd=tmp_path, check=True, ) @@ -193,13 +196,15 @@ def test_parallel_hyperopt(tmp_path): shutil.copy(quickpath, tmp_path) # Run hyperopt sequentially - sp.run( - f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials} -o {output_sequential}".split(), + run_n3fit( + quickpath, + f"{REPLICA} --hyperopt {n_trials} -o {output_sequential}", cwd=tmp_path, check=True, ) # Run hyperopt in parallel + run_setupfit(quickpath, cwd=tmp_path, output=output_parallel) workers = [] my_env = os.environ.copy() my_env["CUDA_VISIBLE_DEVICES"] = "" @@ -248,6 +253,7 @@ def test_parallel_restart(tmp_path): # cp runcard to tmp folder shutil.copy(quickpath, tmp_path) + run_setupfit(quickpath, cwd=tmp_path, output=output) # run some trials for the first time for i in range(n_workers): tmp = sp.Popen(