diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 12216f96d2..2cb23ea439 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,7 +23,7 @@ requirements: - numpy run: # Only install tensorflow on linux. Eigen build is default (for now). - - tensorflow # [linux] + - tensorflow >=2 # [linux] - psutil # to ensure n3fit affinity is with the right processors - hyperopt - seaborn diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py index 7f90a0d292..edab10031f 100644 --- a/n3fit/src/n3fit/checks.py +++ b/n3fit/src/n3fit/checks.py @@ -67,9 +67,9 @@ def check_stopping(parameters): raise CheckError(f"Needs to run at least 1 epoch, got: {epochs}") -def check_basis_with_layers(fitting, parameters): +def check_basis_with_layers(basis, parameters): """ Check that the last layer matches the number of flavours defined in the runcard""" - number_of_flavours = len(fitting["basis"]) + number_of_flavours = len(basis) last_layer = parameters["nodes_per_layer"][-1] if number_of_flavours != last_layer: raise CheckError( @@ -137,34 +137,31 @@ def check_lagrange_multipliers(parameters, key): raise CheckError(f"The {key}::threshold must be a number, received: {threshold}") -def check_model_file(fitting): +def check_model_file(save, load): """ Checks whether the model_files given in the runcard are acceptable """ - save_file = fitting.get("save") - load_file = fitting.get("load") - if save_file: - if not isinstance(save_file, str): - raise CheckError(f"Model file to save to: {save_file} not understood") + if save: + if not isinstance(save, str): + raise CheckError(f"Model file to save to: {save} not understood") # Since the file to save to will be found inside the replica folder, it should writable as all the others - if load_file: - if not isinstance(load_file, str): - raise CheckError(f"Model file to load: {load_file} not understood, str expected") - if not os.path.isfile(load_file): - raise CheckError(f"Model file to load: {load_file} can not be opened, does it exist?") - if not os.access(load_file, os.R_OK): - raise CheckError(f"Model file to load: {load_file} cannot be read, permission denied") - if os.stat(load_file).st_size == 0: - raise CheckError(f"Model file {load_file} seems to be empty") + if load: + if not isinstance(load, str): + raise CheckError(f"Model file to load: {load} not understood, str expected") + if not os.path.isfile(load): + raise CheckError(f"Model file to load: {load} can not be opened, does it exist?") + if not os.access(load, os.R_OK): + raise CheckError(f"Model file to load: {load} cannot be read, permission denied") + if os.stat(load).st_size == 0: + raise CheckError(f"Model file {load} seems to be empty") @make_argcheck -def wrapper_check_NN(fitting): +def wrapper_check_NN(basis, tensorboard, save, load, parameters): """ Wrapper function for all NN-related checks """ - check_tensorboard(fitting.get("tensorboard")) - parameters = fitting["parameters"] - check_model_file(fitting) + check_tensorboard(tensorboard) + check_model_file(save, load) check_existing_parameters(parameters) check_consistent_layers(parameters) - check_basis_with_layers(fitting, parameters) + check_basis_with_layers(basis, parameters) check_stopping(parameters) check_dropout(parameters) check_lagrange_multipliers(parameters, "integrability") @@ -244,13 +241,11 @@ def check_kfold_options(kfold): ) -def check_correct_partitions(kfold, experiments_data): +def check_correct_partitions(kfold, data): """Ensures that all experimennts in all partitions are included in the fit definition""" # Get all datasets - datasets = [] - for exp in experiments_data: - datasets += [i.name for i in exp.datasets] + datasets = list(map(str, data)) for partition in kfold["partitions"]: fold_sets = partition["datasets"] for dset in fold_sets: @@ -290,13 +285,13 @@ def check_hyperopt_stopping(stopping_dict): @make_argcheck -def wrapper_hyperopt(hyperopt, hyperscan, fitting, experiments_data): +def wrapper_hyperopt(hyperopt, hyperscan, genrep, data): """Wrapper function for all hyperopt-related checks No check is performed if hyperopt is not active """ if not hyperopt: return None - if fitting["genrep"]: + if genrep: raise CheckError("Generation of replicas is not accepted during hyperoptimization") if hyperscan is None: raise CheckError("Can't perform hyperoptimization without the hyperscan key") @@ -306,7 +301,7 @@ def wrapper_hyperopt(hyperopt, hyperscan, fitting, experiments_data): check_hyperopt_architecture(hyperscan.get("architecture")) check_hyperopt_positivity(hyperscan.get("positivity")) check_kfold_options(hyperscan["kfold"]) - check_correct_partitions(hyperscan["kfold"], experiments_data) + check_correct_partitions(hyperscan["kfold"], data) def check_sumrules(sum_rules): @@ -322,17 +317,16 @@ def check_sumrules(sum_rules): # Checks on the physics @make_argcheck -def check_consistent_basis(fitting, theoryid): +def check_consistent_basis(sum_rules, fitbasis, basis, theoryid): """Checks the fitbasis setup for inconsistencies - Checks the sum rules can be imposed - Correct flavours for the selected basis - Correct ranges (min < max) for the small and large-x exponents """ - check_sumrules(fitting.get("sum_rules", True)) - fitbasis = fitting["fitbasis"] + check_sumrules(sum_rules) # Check that there are no duplicate flavours and that parameters are sane flavs = [] - for flavour_dict in fitting["basis"]: + for flavour_dict in basis: name = flavour_dict["fl"] smallx = flavour_dict["smallx"] if smallx[0] > smallx[1]: diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index b12e6359ea..669eb3b647 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -5,7 +5,6 @@ # Backend-independent imports from collections import namedtuple import logging -import os.path import numpy as np import n3fit.checks from n3fit.vpinterface import N3PDF @@ -13,134 +12,116 @@ log = logging.getLogger(__name__) -def initialize_seeds(replica: list, trvlseed: int, nnseed: int, mcseed: int, genrep: bool): - """Action to initialize seeds for random number generation. - We initialize three different seeds. The first is the seed - used for training/validation splits, the second is used for - initialization of the neural network's parameters and the - final one is the monte carlo seeds for pseudodata replica - generation. - - The generation of these seeds depend on the replica number - in question. This dependence comes in by sampling the random - number generator times in the for loop. - - Parameters - ---------- - replica: list - A list of replica numbers to run over typically of size one - trvlseed: int - Seed initialization for training/validation split - nnseed: int - Seed for network initialization - mcseed: int - Seed for pseudodata replica generation - genrep: bool - - Returns - ------- - seeds: NamedTuple[List, List, List] - A namedtuple of lists containing the trvalseeds, nnseeds, mcseeds - """ - # First set the seed variables for - # - Tr/Vl split - # - Neural Network initialization - # - Replica generation - # These depend both on the seed set in the runcard and the replica number - trvalseeds = [] - nnseeds = [] - mcseeds = [] - for replica_number in replica: - np.random.seed(trvlseed) - for _ in range(replica_number): - trvalseed = np.random.randint(0, pow(2, 31)) - - np.random.seed(nnseed) - for _ in range(replica_number): - nnseed = np.random.randint(0, pow(2, 31)) - - np.random.seed(mcseed) - for _ in range(replica_number): - mcseed = np.random.randint(0, pow(2, 31)) - trvalseeds.append(trvalseed) - nnseeds.append(nnseed) - mcseeds.append(mcseed) - - if genrep == 0: - mcseeds = [] - - Seeds = namedtuple("Seeds", ["trvlseeds", "nnseeds", "mcseeds"]) - return Seeds(trvalseeds, nnseeds, mcseeds) - - -# Action to be called by valid phys +# Action to be called by validphys # All information defining the NN should come here in the "parameters" dict @n3fit.checks.check_consistent_basis @n3fit.checks.wrapper_check_NN @n3fit.checks.wrapper_hyperopt def performfit( - fitting, - experiments_data, - t0set, - replica, + *, + genrep, # used for checks + data, # used for checks + replicas_nnseed_fitting_data_dict, + posdatasets_fitting_pos_dict, + integdatasets_fitting_integ_dict, + theoryid, + basis, + fitbasis, + sum_rules=True, + parameters, replica_path, output_path, - theoryid, - posdatasets, - integdatasets=None, + save_weights_each=None, + save=None, + load=None, hyperscan=None, hyperopt=None, + kfold_parameters, + tensorboard=None, debug=False, maxcores=None, ): """ - This action will (upon having read a validcard) process a full PDF fit for a given replica. + This action will (upon having read a validcard) process a full PDF fit + for a set of replicas. The input to this function is provided by validphys and/or defined in the runcards or commandline arguments. + This controller is provided with: + 1. Seeds generated using the replica number and the seeds defined in the runcard. + 2. Loaded datasets with replicas generated. + 2.1 Loaded positivity/integrability sets. + The workflow of this controller is as follows: - 1. Generates seeds using the replica number and the seeds defined in the runcard, - 2. Read up all datasets from the given experiments and create the necessary replicas - 2.1 Read up also the positivity sets - 3. Generate a ModelTrainer object holding information to create the NN and perform a fit + 1. Generate a ModelTrainer object holding information to create the NN and perform a fit (at this point no NN object has been generated) - 3.1 (if hyperopt) generates the hyperopt scanning dictionary + 1.1 (if hyperopt) generates the hyperopt scanning dictionary taking as a base the fitting dictionary and the runcard's hyperscan dictionary - 4. Pass the dictionary of parameters to ModelTrainer + 2. Pass the dictionary of parameters to ModelTrainer for the NN to be generated and the fit performed - 4.1 (if hyperopt) Loop over point 4 for `hyperopt` number of times - 5. Once the fit is finished, output the PDF grid and accompanying files + 2.1 (if hyperopt) Loop over point 4 for `hyperopt` number of times + 3. Once the fit is finished, output the PDF grid and accompanying files Parameters ---------- - fitting: dict - dictionary with the hyperparameters of the fit - experiments_data: dict - vp list of datasets grouped into experiments to be included in - the fit - t0set: str - t0set name - replica: list - a list of replica numbers to run over (typically just one) + genrep: bool + Whether or not to generate MC replicas. (Only used for checks) + data: validphys.core.DataGroupSpec + containing the datasets to be included in the fit. (Only used + for checks) + replicas_nnseed_fitting_data_dict: list[tuple] + list with element for each replica (typically just one) to be + fitted. Each element + is a tuple containing the replica number, nnseed and + ``fitted_data_dict`` containing all of the data, metadata + for each group of datasets which is to be fitted. + posdatasets_fitting_pos_dict: list[dict] + list of dictionaries containing all data and metadata for each + positivity dataset + integdatasets_fitting_integ_dict: list[dict] + list of dictionaries containing all data and metadata for each + integrability dataset + theoryid: validphys.core.TheoryIDSpec + Theory which is used to generate theory predictions from model + during fit. Object also contains some metadata on the theory + settings. + basis: list[dict] + preprocessing information for each flavour to be fitted. + fitbasis: str + Valid basis which the fit is to be ran in. Available bases can + be found in :py:mod:`validphys.pdfbases`. + sum_rules: bool + Whether to impose sum rules in fit. By default set to True + parameters: dict + Mapping containing parameters which define the network + architecture/fitting methodology. replica_path: pathlib.Path path to the output of this run output_path: str name of the fit - theorid: int - theory id number - posdatasets: list - list of positivity datasets - integdatasets: list - list of integrability datasets + save_weights_each: None, int + if set, save the state of the fit every ``save_weights_each`` + epochs + save: None, str + model file where weights will be saved, used in conjunction with + ``load``. + load: None, str + model file from which to load weights from. hyperscan: dict dictionary containing the details of the hyperscan hyperopt: int if given, number of hyperopt iterations to run + kfold_parameters: None, dict + dictionary with kfold settings used in hyperopt. + tensorboard: None, dict + mapping containing tensorboard settings if it is to be used. By + default it is None and tensorboard is not enabled. debug: bool activate some debug options maxcores: int maximum number of (logical) cores that the backend should be aware of + """ from n3fit.backends import set_initial_state @@ -155,77 +136,11 @@ def performfit( # so they can eventually be set from the runcard from n3fit.ModelTrainer import ModelTrainer from n3fit.io.writer import WriterWrapper - from n3fit.backends import MetaModel, operations - import n3fit.io.reader as reader - - # Loading t0set from LHAPDF - if t0set is not None: - t0pdfset = t0set.load_t0() - else: - t0pdfset = None - - trvlseed, nnseed, mcseed, genrep = [ - fitting.get(i) for i in ["trvlseed", "nnseed", "mcseed", "genrep"] - ] - - seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep) - trvalseeds, nnseeds, mcseeds = seeds.trvlseeds, seeds.nnseeds, seeds.mcseeds - - ############################################################################## - # ### Read files - # Loop over all the experiment and positivity datasets - # and save them into two list of dictionaries: exp_info and pos_info - # later on we will loop over these two lists and generate the actual layers - # i.e., at this point there is no information about the NN - # we are just creating dictionaries with all the necessary information - # (experimental data, covariance matrix, replicas, etc, tr/val split) - ############################################################################## - all_exp_infos = [[] for _ in replica] - if fitting.get("diagonal_basis"): - log.info("working in diagonal basis") - - if hyperscan and hyperopt: - kfold_parameters = hyperscan["kfold"] - kpartitions = kfold_parameters["partitions"] - else: - kfold_parameters = None - kpartitions = None - - # First loop over the experiments - for exp in experiments_data: - log.info("Loading experiment: {0}".format(exp)) - all_exp_dicts = reader.common_data_reader( - exp, - t0pdfset, - replica_seeds=mcseeds, - trval_seeds=trvalseeds, - kpartitions=kpartitions, - rotate_diagonal=fitting.get("diagonal_basis"), - ) - for i, exp_dict in enumerate(all_exp_dicts): - all_exp_infos[i].append(exp_dict) - - # Now read all the positivity datasets - pos_info = [] - for pos_set in posdatasets: - log.info("Loading positivity dataset %s", pos_set) - pos_dict = reader.positivity_reader(pos_set) - pos_info.append(pos_dict) - - if integdatasets is not None: - integ_info = [] - for integ_set in integdatasets: - log.info("Loading integrability dataset %s", integ_set) - # Use the same reader as positivity observables - integ_dict = reader.positivity_reader(integ_set) - integ_info.append(integ_dict) - else: - integ_info = None # Note: In the basic scenario we are only running for one replica and thus this loop is only - # run once and all_exp_infos is a list of just than one element + # run once as replicas_nnseed_fitting_data_dict is a list of just one element stopwatch.register_times("data_loaded") - for replica_number, exp_info, nnseed in zip(replica, all_exp_infos, nnseeds): + for replica_number, exp_info, nnseed in replicas_nnseed_fitting_data_dict: replica_path_set = replica_path / f"replica_{replica_number}" log.info("Starting replica fit %s", replica_number) @@ -233,24 +148,23 @@ def performfit( # this object holds all necessary information to train a PDF (up to the NN definition) the_model_trainer = ModelTrainer( exp_info, - pos_info, - integ_info, - fitting["basis"], - fitting["fitbasis"], + posdatasets_fitting_pos_dict, + integdatasets_fitting_integ_dict, + basis, + fitbasis, nnseed, debug=debug, - save_weights_each=fitting.get("save_weights_each"), + save_weights_each=save_weights_each, kfold_parameters=kfold_parameters, max_cores=maxcores, - model_file=fitting.get("load"), - sum_rules=fitting.get("sum_rules", True) + model_file=load, + sum_rules=sum_rules ) # This is just to give a descriptive name to the fit function pdf_gen_and_train_function = the_model_trainer.hyperparametrizable # Read up the parameters of the NN from the runcard - parameters = fitting.get("parameters") stopwatch.register_times("replica_set") ######################################################################## @@ -280,10 +194,9 @@ def performfit( the_model_trainer.set_hyperopt(False) # Enable the tensorboard callback - tboard = fitting.get("tensorboard") - if tboard is not None: - profiling = tboard.get("profiling", False) - weight_freq = tboard.get("weight_freq", 0) + if tensorboard is not None: + profiling = tensorboard.get("profiling", False) + weight_freq = tensorboard.get("weight_freq", 0) log_path = replica_path_set / "tboard" the_model_trainer.enable_tensorboard(log_path, weight_freq, profiling) @@ -318,7 +231,7 @@ def performfit( ) # Create a pdf instance - pdf_instance = N3PDF(pdf_model, fit_basis=fitting.get("basis")) + pdf_instance = N3PDF(pdf_model, fit_basis=basis) # Generate the writer wrapper writer_wrapper = WriterWrapper( @@ -336,7 +249,7 @@ def performfit( ) # Save the weights to some file for the given replica - model_file = fitting.get("save") + model_file = save if model_file: model_file_path = replica_path_set / model_file log.info(" > Saving the weights for future in %s", model_file_path) @@ -355,5 +268,5 @@ def performfit( # So every time we want to capture output_path.name and addd a history_step_X # parallel to the nnfit folder - if tboard is not None: + if tensorboard is not None: log.info("Tensorboard logging information is stored at %s", log_path) diff --git a/n3fit/src/n3fit/scripts/n3fit_exec.py b/n3fit/src/n3fit/scripts/n3fit_exec.py index 7609999a68..6861f9cbda 100755 --- a/n3fit/src/n3fit/scripts/n3fit_exec.py +++ b/n3fit/src/n3fit/scripts/n3fit_exec.py @@ -17,6 +17,7 @@ from validphys.core import FitSpec from reportengine import colors from reportengine.compat import yaml +from reportengine.namespaces import NSList N3FIT_FIXED_CONFIG = dict( @@ -25,13 +26,13 @@ actions_ = [] ) -N3FIT_PROVIDERS = ["n3fit.performfit", "validphys.results"] +N3FIT_PROVIDERS = ["n3fit.performfit", "validphys.results", "validphys.n3fit_data"] log = logging.getLogger(__name__) RUNCARD_COPY_FILENAME = "filter.yml" INPUT_FOLDER = "input" - +TAB_FOLDER = "tables" class N3FitError(Exception): """Exception raised when n3fit cannot succeed and knows why""" @@ -68,13 +69,17 @@ def init_output(self): # create output folder for the fit self.replica_path = self.output_path / "nnfit" - for replica in self.replica: + for replica in self.replicas: path = self.replica_path / "replica_{0}".format(replica) log.info("Creating replica output folder in {0}".format(path)) try: path.mkdir(exist_ok=True) except OSError as e: raise EnvironmentError_(e) from e + + # place tables in last replica path, folder already exists. + self.table_folder = path + # make lockfile input inside of replica folder # avoid conflict with setupfit self.input_folder = self.replica_path / INPUT_FOLDER @@ -83,7 +88,7 @@ def init_output(self): @classmethod def ns_dump_description(cls): return { - "replica": "The MC replica number", + "replicas": "The MC replica number/s", "replica_path": "The replica output path", "output_path": "The runcard name", "hyperopt": "The hyperopt flag", @@ -111,11 +116,21 @@ def from_yaml(cls, o, *args, **kwargs): raise ConfigError(f"Expecting input runcard to be a mapping, " f"not '{type(file_content)}'.") if file_content.get('closuretest') is not None: - N3FIT_FIXED_CONFIG['actions_'].append( - 'datacuts::theory::closuretest performfit') + fit_action = 'datacuts::theory::closuretest::fitting performfit' else: - N3FIT_FIXED_CONFIG['actions_'].append( - 'datacuts::theory performfit') + fit_action = 'datacuts::theory::fitting performfit' + N3FIT_FIXED_CONFIG['actions_'].append(fit_action) + + if file_content["fitting"].get("savepseudodata"): + if len(kwargs["environment"].replicas) != 1: + raise ConfigError( + "Cannot request that multiple replicas are fitted and that " + "pseudodata is saved. Either set `fitting::savepseudodata` " + "to `false` or fit replicas one at a time." + ) + # take same namespace configuration on the pseudodata_table action. + table_action = fit_action.replace('performfit', 'pseudodata_table') + N3FIT_FIXED_CONFIG['actions_'].append(table_action) file_content.update(N3FIT_FIXED_CONFIG) return cls(file_content, *args, **kwargs) @@ -147,7 +162,15 @@ def produce_use_fitcommondata(self, fakedata): """ return fakedata + def produce_kfold_parameters(self, hyperscan=None, hyperopt=None): + if hyperscan and hyperopt: + return hyperscan["kfold"] + return None + def produce_kpartitions(self, kfold_parameters): + if kfold_parameters: + return kfold_parameters["partitions"] + return None class N3FitApp(App): @@ -191,7 +214,7 @@ def run(self): replicas = list(range(replica, self.args["replica_range"] + 1)) else: replicas = [replica] - self.environment.replica = replicas + self.environment.replicas = NSList(replicas, nskey="replica") self.environment.hyperopt = self.args["hyperopt"] super().run() except N3FitError as e: diff --git a/n3fit/src/n3fit/tests/test_fit.py b/n3fit/src/n3fit/tests/test_fit.py index adfb4809a8..a0d7b2d21a 100644 --- a/n3fit/src/n3fit/tests/test_fit.py +++ b/n3fit/src/n3fit/tests/test_fit.py @@ -30,7 +30,7 @@ from numpy.testing import assert_equal, assert_allclose from reportengine.compat import yaml import n3fit -from n3fit.performfit import initialize_seeds +from validphys.n3fit_data import replica_trvlseed, replica_nnseed, replica_mcseed log = logging.getLogger(__name__) REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") @@ -69,19 +69,22 @@ def load_data(info_file): def test_initialize_seeds(): # Regression tests for seed generation replicas = [1, 4] - Seeds = namedtuple("Seeds", ["trvlseeds", "nnseeds", "mcseeds"]) - regression = Seeds( - trvlseeds=[2005877882, 741720773], - nnseeds=[327741615, 87982037], - mcseeds=[1791095845, 2029572362], - ) - result = initialize_seeds(replicas, 4, 7, 1, True) - assert result == regression - result_nomc = initialize_seeds(replicas, 10, 100, 1000, False) - regression_nomc = Seeds( - trvlseeds=[1165313289, 2124247567], nnseeds=[186422792, 1315999533], mcseeds=[] - ) - assert result_nomc == regression_nomc + + trvlseeds = [replica_trvlseed(rep, 4) for rep in replicas] + assert trvlseeds == [2005877882, 741720773] + nnseeds = [replica_nnseed(rep, 7) for rep in replicas] + assert nnseeds == [327741615, 1369975286] + mcseeds = [replica_mcseed(rep, 1, True) for rep in replicas] + assert mcseeds == [1791095845, 1857819720] + + mcseeds_nomc = [replica_mcseed(rep, 1000, False) for rep in replicas] + assert mcseeds_nomc == [None, None] + + # test that we always get same answer + same_replicas = [3]*10 + assert len({replica_trvlseed(rep, 4) for rep in same_replicas}) == 1 + assert len({replica_nnseed(rep, 7) for rep in same_replicas}) == 1 + assert len({replica_mcseed(rep, 1, True) for rep in same_replicas}) == 1 def auxiliary_performfit(tmp_path, replica=1, timing=True, rel_error=2e-3): diff --git a/validphys2/src/validphys/app.py b/validphys2/src/validphys/app.py index 0de5827f6e..a7cd2a6489 100644 --- a/validphys2/src/validphys/app.py +++ b/validphys2/src/validphys/app.py @@ -50,6 +50,7 @@ 'validphys.covmats', 'validphys.hyperoptplot', 'validphys.deltachi2', + 'validphys.n3fit_data', 'reportengine.report', ] diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py new file mode 100644 index 0000000000..9d94c12839 --- /dev/null +++ b/validphys2/src/validphys/n3fit_data.py @@ -0,0 +1,513 @@ +""" +n3fit_data.py + +Providers which prepare the data ready for +:py:func:`n3fit.performfit.performfit`. Returns python objects but the underlying +functions make calls to libnnpdf C++ library. + +""" +from collections import defaultdict +from copy import deepcopy +import hashlib +import logging + +import numpy as np +import pandas as pd + +from NNPDF import RandomGenerator +from reportengine import collect +from reportengine.table import table + +from validphys.n3fit_data_utils import ( + common_data_reader_experiment, + positivity_reader, +) + +log = logging.getLogger(__name__) + +def replica_trvlseed(replica, trvlseed): + """Generates the ``trvlseed`` for a ``replica``.""" + # TODO: move to the new infrastructure + # https://numpy.org/doc/stable/reference/random/index.html#introduction + np.random.seed(seed=trvlseed) + for _ in range(replica): + res = np.random.randint(0, pow(2, 31)) + return res + +def replica_nnseed(replica, nnseed): + """Generates the ``nnseed`` for a ``replica``.""" + np.random.seed(seed=nnseed) + for _ in range(replica): + res = np.random.randint(0, pow(2, 31)) + return res + +def replica_mcseed(replica, mcseed, genrep): + """Generates the ``mcseed`` for a ``replica``.""" + if not genrep: + return None + np.random.seed(seed=mcseed) + for _ in range(replica): + res = np.random.randint(0, pow(2, 31)) + return res + + +def tr_masks(data, replica_trvlseed): + """Generate the boolean masks used to split data into training and + validation points. Returns a list of 1-D boolean arrays, one for each + dataset. Each array has length equal to N_data, the datapoints which + will be included in the training are ``True`` such that + + tr_data = data[tr_mask] + + """ + nameseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10 ** 8 + nameseed += replica_trvlseed + # TODO: update this to new random infrastructure. + np.random.seed(nameseed) + trmask_partial = [] + for dataset in data.datasets: + # TODO: python commondata will not require this rubbish. + # all data if cuts are None + cuts = dataset.cuts + ndata = len(cuts.load()) if cuts else dataset.commondata.ndata + frac = dataset.frac + trmax = int(frac * ndata) + mask = np.concatenate( + [np.ones(trmax, dtype=np.bool), np.zeros(ndata - trmax, dtype=np.bool)] + ) + np.random.shuffle(mask) + trmask_partial.append(mask) + return trmask_partial + +def kfold_masks(kpartitions, data): + """Collect the masks (if any) due to kfolding for this data. + These will be applied to the experimental data before starting + the training of each fold. + + Parameters + ---------- + kpartitions: list[dict] + list of partitions, each partition dictionary with key-value pair + `datasets` and a list containing the names of all datasets in that + partition. See n3fit/runcards/Basic_hyperopt.yml for an example + runcard or the hyperopt documentation for an expanded discussion on + k-fold partitions. + data: validphys.core.DataGroupSpec + full list of data which is to be partitioned. + + Returns + ------- + kfold_masks: list[np.array] + A list containing a boolean array for each partition. Each array is + a 1-D boolean array with length equal to the number of cut datapoints + in ``data``. If a dataset is included in a particular fold then the + mask will be True for the elements corresponding to those datasets + such that data.load().get_cv()[kfold_masks[i]] will return the + datapoints in the ith partition. See example below. + + Examples + -------- + >>> from validphys.api import API + >>> partitions=[ + ... {"datasets": ["HERACOMBCCEM", "HERACOMBNCEP460", "NMC", "NTVNBDMNFe"]}, + ... {"datasets": ["HERACOMBCCEP", "HERACOMBNCEP575", "NMCPD", "NTVNUDMNFe"]} + ... ] + >>> ds_inputs = [{"dataset": ds} for part in partitions for ds in part["datasets"]] + >>> kfold_masks = API.kfold_masks(dataset_inputs=ds_inputs, kpartitions=partitions, theoryid=53, use_cuts="nocuts") + >>> len(kfold_masks) # one element for each partition + 2 + >>> kfold_masks[0] # mask which splits data into first partition + array([False, False, False, ..., True, True, True]) + >>> data = API.data(dataset_inputs=ds_inputs, theoryid=53, use_cuts="nocuts") + >>> fold_data = data.load().get_cv()[kfold_masks[0]] + >>> len(fold_data) + 604 + >>> kfold_masks[0].sum() + 604 + + """ + list_folds = [] + if kpartitions is not None: + for partition in kpartitions: + data_fold = partition.get("datasets", []) + mask = [] + for dataset in data.datasets: + # TODO: python commondata will not require this rubbish. + # all data if cuts are None + cuts = dataset.cuts + ndata = len(cuts.load()) if cuts else dataset.commondata.ndata + # If the dataset is in the fold, its mask is full of 0s + if str(dataset) in data_fold: + mask.append(np.zeros(ndata, dtype=np.bool)) + # otherwise of ones + else: + mask.append(np.ones(ndata, dtype=np.bool)) + list_folds.append(np.concatenate(mask)) + return list_folds + + +def _mask_fk_tables(dataset_dicts, tr_masks): + """ + Internal function which masks the fktables for a group of datasets. + + Parameters + ---------- + dataset_dicts: list[dict] + list of datasets dictionaries returned by + :py:func:`validphys.n3fit_data_utils.common_data_reader_experiment`. + tr_masks: list[np.array] + a tuple containing the lists of training masks for each dataset. + + Return + ------ + data_trmask: np.array + boolean array resulting from concatenating the training masks of + each dataset. + + Note: the returned masks are only used in order to mask the covmat + """ + trmask_partial = tr_masks + for dataset_dict, tr_mask in zip(dataset_dicts, trmask_partial): + # Generate the training and validation fktables + tr_fks = [] + vl_fks = [] + ex_fks = [] + vl_mask = ~tr_mask + for fktable_dict in dataset_dict["fktables"]: + tr_fks.append(fktable_dict["fktable"][tr_mask]) + vl_fks.append(fktable_dict["fktable"][vl_mask]) + ex_fks.append(fktable_dict.get("fktable")) + dataset_dict["tr_fktables"] = tr_fks + dataset_dict["vl_fktables"] = vl_fks + dataset_dict["ex_fktables"] = ex_fks + + return np.concatenate(trmask_partial) + + +def generate_data_replica(data, replica_mcseed): + """Generate a pseudodata replica for ``data`` given the ``replica_seed``""" + spec_c = data.load() + base_mcseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10 ** 8 + # copy C++ object to avoid mutation + # t0 not required for replica generation, since libnnpdf uses experimental + # covmat to generate replicas. + spec_replica_c = type(spec_c)(spec_c) + + # Replica generation + if replica_mcseed is not None: + mcseed = base_mcseed + replica_mcseed + RandomGenerator.InitRNG(0, mcseed) + spec_replica_c.MakeReplica() + return spec_replica_c.get_cv() + + +def fitting_data_dict( + data, + generate_data_replica, + tr_masks, + kfold_masks, + t0set=None, + diagonal_basis=None, +): + """ + Provider which takes the information from validphys ``data``. + + Returns + ------- + all_dict_out: dict + Containing all the information of the experiment/dataset + for training, validation and experimental With the following keys: + + 'datasets' + list of dictionaries for each of the datasets contained in ``data`` + 'name' + name of the ``data`` - typically experiment/group name + 'expdata_true' + non-replica data + 'invcovmat_true' + inverse of the covmat (non-replica) + 'trmask' + mask for the training data + 'invcovmat' + inverse of the covmat for the training data + 'ndata' + number of datapoints for the training data + 'expdata' + experimental data (replica'd) for training + 'vlmask' + (same as above for validation) + 'invcovmat_vl' + (same as above for validation) + 'ndata_vl' + (same as above for validation) + 'expdata_vl' + (same as above for validation) + 'positivity' + bool - is this a positivity set? + 'count_chi2' + should this be counted towards the chi2 + """ + # TODO: Plug in the python data loading when available. Including but not + # limited to: central values, ndata, replica generation, covmat construction + spec_c = data.load() + ndata = spec_c.GetNData() + expdata_true = spec_c.get_cv().reshape(1, ndata) + if t0set: + t0pdfset = t0set.load_t0() + spec_c.SetT0(t0pdfset) + + expdata = generate_data_replica + + datasets = common_data_reader_experiment(spec_c, data) + + # t0 covmat + covmat = spec_c.get_covmat() + inv_true = np.linalg.inv(covmat) + + if diagonal_basis: + log.info("working in diagonal basis.") + eig, v = np.linalg.eigh(covmat) + dt_trans = v.T + else: + dt_trans = None + dt_trans_tr = None + dt_trans_vl = None + + + # Copy dataset dict because we mutate it. + datasets_copy = deepcopy(datasets) + + tr_mask = _mask_fk_tables(datasets_copy, tr_masks) + vl_mask = ~tr_mask + + if diagonal_basis: + expdata = np.matmul(dt_trans, expdata) + # make a 1d array of the diagonal + covmat_tr = eig[tr_mask] + invcovmat_tr = 1./covmat_tr + + covmat_vl = eig[vl_mask] + invcovmat_vl = 1./covmat_vl + + # prepare a masking rotation + dt_trans_tr = dt_trans[tr_mask] + dt_trans_vl = dt_trans[vl_mask] + else: + covmat_tr = covmat[tr_mask].T[tr_mask] + invcovmat_tr = np.linalg.inv(covmat_tr) + + covmat_vl = covmat[vl_mask].T[vl_mask] + invcovmat_vl = np.linalg.inv(covmat_vl) + + ndata_tr = np.count_nonzero(tr_mask) + expdata_tr = expdata[tr_mask].reshape(1, ndata_tr) + + ndata_vl = np.count_nonzero(vl_mask) + expdata_vl = expdata[vl_mask].reshape(1, ndata_vl) + + # Now save a dictionary of training/validation/experimental folds + # for training and validation we need to apply the tr/vl masks + # for experimental we need to negate the mask + folds = defaultdict(list) + for fold in kfold_masks: + folds["training"].append(fold[tr_mask]) + folds["validation"].append(fold[vl_mask]) + folds["experimental"].append(~fold) + + dict_out = { + "datasets": datasets_copy, + "name": str(data), + "expdata_true": expdata_true, + "invcovmat_true": inv_true, + "trmask": tr_mask, + "invcovmat": invcovmat_tr, + "ndata": ndata_tr, + "expdata": expdata_tr, + "vlmask": vl_mask, + "invcovmat_vl": invcovmat_vl, + "ndata_vl": ndata_vl, + "expdata_vl": expdata_vl, + "positivity": False, + "count_chi2": True, + "folds" : folds, + "data_transformation": dt_trans_tr, + "data_transformation_vl": dt_trans_vl, + } + return dict_out + +exps_fitting_data_dict = collect("fitting_data_dict", ("group_dataset_inputs_by_experiment",)) + +def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nnseed): + """For a single replica return a tuple of the inputs to this function. + Used with `collect` over replicas to avoid having to perform multiple + collects. + + See Also + -------- + replicas_nnseed_fitting_data_dict - the result of collecting this function + over replicas. + + """ + return (replica, exps_fitting_data_dict, replica_nnseed) + +replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",)) + +exps_pseudodata = collect("generate_data_replica", ("group_dataset_inputs_by_experiment",)) +replicas_exps_pseudodata = collect("exps_pseudodata", ("replicas",)) + +@table +def pseudodata_table(replicas_exps_pseudodata, replicas, experiments_index): + """Creates a pandas DataFrame containing the generated pseudodata. The + index is :py:func:`validphys.results.experiments_index` and the columns + are the replica numbers. + + Notes + ----- + Whilst running ``n3fit``, this action will only be called if + `fitting::savepseudodata` is `true` and replicas are fitted one at a time. + The table can be found in the replica folder i.e. /nnfit/replica_*/ + + """ + rep_dfs = [] + for rep_exps_pseudodata, rep in zip(replicas_exps_pseudodata, replicas): + all_pseudodata = np.concatenate(rep_exps_pseudodata) + rep_dfs.append(pd.DataFrame( + all_pseudodata, + columns=[f"replica {rep}"], + index=experiments_index + )) + return pd.concat(rep_dfs, axis=1) + + +exps_tr_masks = collect("tr_masks", ("group_dataset_inputs_by_experiment",)) +replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",)) + + +@table +def training_mask_table(replicas_exps_tr_masks, replicas, experiments_index): + """Save the boolean mask used to split data into training and validation + for each replica as a pandas DataFrame, indexed by + :py:func:`validphys.results.experiments_index`. Can be used to reconstruct + the training and validation data used in a fit. + + Parameters + ---------- + replicas_exps_tr_masks: list[list[list[np.array]]] + Result of :py:func:`tr_masks` collected over experiments then replicas, + which creates the nested structure. The outer list is len(replicas), + the next list is len(group_dataset_inputs_by_experiment) and the + inner-most list has an array for each dataset in that particular + experiment - as defined by the metadata. The arrays should be 1-D + boolean arrays which can be used as masks. + replicas: NSlist + Namespace list of replica numbers to tabulate masks for, each element + of the list should be a `replica`. See example below for more + information. + experiments_index: pd.MultiIndex + Index returned by :py:func:`validphys.results.experiments_index`. + + + Example + ------- + >>> from validphys.api import API + >>> from reportengine.namespaces import NSList + >>> # create namespace list for collects over replicas. + >>> reps = NSList(list(range(1, 4)), nskey="replica") + >>> ds_inp = [ + ... {'dataset': 'NMC', 'frac': 0.75}, + ... {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75}, + ... {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75} + ... ] + >>> API.training_mask_table(dataset_inputs=ds_inp, replicas=reps, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False) + replica 1 replica 2 replica 3 + group dataset id + NMC NMC 0 True False False + 1 True True True + 2 False True True + 3 True True False + 4 True True True + ... ... ... ... + CMS CMSZDIFF12 45 True True True + 46 True False True + 47 True True True + 48 False True True + 49 True True True + + [345 rows x 3 columns] + + """ + rep_dfs = [] + for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas): + # create flat list with all dataset masks in, then concatenate to single + # array. + all_masks = np.concatenate([ + ds_mask + for exp_masks in rep_exps_masks + for ds_mask in exp_masks + ]) + rep_dfs.append(pd.DataFrame( + all_masks, + columns=[f"replica {rep}"], + index=experiments_index + )) + return pd.concat(rep_dfs, axis=1) + +def fitting_pos_dict(posdataset): + """Loads a positivity dataset. For more information see + :py:func:`validphys.n3fit_data_utils.positivity_reader`. + + Parameters + ---------- + posdataset: validphys.core.PositivitySetSpec + Positivity set which is to be loaded. + + Examples + -------- + >>> from validphys.api import API + >>> posdataset = {"dataset": "POSF2U", "poslambda": 1e6} + >>> pos = API.fitting_pos_dict(posdataset=posdataset, theoryid=162) + >>> len(pos) + 9 + + """ + log.info("Loading positivity dataset %s", posdataset) + return positivity_reader(posdataset) + +posdatasets_fitting_pos_dict = collect("fitting_pos_dict", ("posdatasets",)) + + +#can't use collect here because integdatasets might not exist. +def integdatasets_fitting_integ_dict(integdatasets=None): + """Loads a integrability dataset. Calls same function as + :py:func:`fitting_pos_dict`, except on each element of + ``integdatasets`` if ``integdatasets`` is not None. + + Parameters + ---------- + integdatasets: list[validphys.core.PositivitySetSpec] + list containing the settings for the integrability sets. Examples of + these can be found in the runcards located in n3fit/runcards. They have + a format similar to ``dataset_input``. + + Examples + -------- + >>> from validphys.api import API + >>> integdatasets = [{"dataset": "INTEGXT3", "poslambda": 1e2}] + >>> res = API.integdatasets_fitting_integ_dict(integdatasets=integdatasets, theoryid=53) + >>> len(res), len(res[0]) + (1, 9) + >>> res = API.integdatasets_fitting_integ_dict(integdatasets=None) + >>> print(res) + None + + """ + if integdatasets is not None: + integ_info = [] + for integ_set in integdatasets: + log.info("Loading integrability dataset %s", integ_set) + # Use the same reader as positivity observables + integ_dict = positivity_reader(integ_set) + integ_info.append(integ_dict) + return integ_info + log.warning("Not using any integrability datasets.") + return None diff --git a/n3fit/src/n3fit/io/reader.py b/validphys2/src/validphys/n3fit_data_utils.py similarity index 99% rename from n3fit/src/n3fit/io/reader.py rename to validphys2/src/validphys/n3fit_data_utils.py index c6f598cd0c..601d383ad6 100644 --- a/n3fit/src/n3fit/io/reader.py +++ b/validphys2/src/validphys/n3fit_data_utils.py @@ -1,5 +1,8 @@ """ - Library of function for reading NNPDF objects +n3fit_data_utils.py + +Library of function for reading libnnpdf objects. + """ import hashlib from copy import deepcopy diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index dba391e6ad..b3c9c7036a 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -16,8 +16,8 @@ from reportengine import collect -import n3fit.io.reader as reader -from n3fit.performfit import initialize_seeds +from validphys.n3fit_data import replica_mcseed, replica_trvlseed +import validphys.n3fit_data_utils as reader log = logging.getLogger(__name__) @@ -301,12 +301,20 @@ def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=N # include the last replica replica = range(1, num_fitted_replicas + 1) - trvlseed, nnseed, mcseed, genrep = [ + trvlseed, mcseed, genrep = [ fit.as_input().get("fitting").get(i) - for i in ["trvlseed", "nnseed", "mcseed", "genrep"] + for i in ["trvlseed", "mcseed", "genrep"] ] - seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep) + # common_data_reader expects None if genrep is False + if genrep: + replicas_mcseed = [ + replica_mcseed(rep, mcseed, genrep) for rep in replica + ] + else: + replicas_mcseed = None + + replicas_trvlseeds = [replica_trvlseed(rep, trvlseed) for rep in replica] def task(d, mcseeds, trvlseeds, replicas): all_exp_infos = [[] for _ in range(len(mcseeds))] @@ -321,7 +329,7 @@ def task(d, mcseeds, trvlseeds, replicas): if NPROC == 1: pseudodata_dicts = dict() - task(pseudodata_dicts, seeds.mcseeds, seeds.trvlseeds, replica) + task(pseudodata_dicts, replicas_mcseed, replicas_trvlseeds, replica) else: with mp.Manager() as manager: d = manager.dict() @@ -340,8 +348,8 @@ def task(d, mcseeds, trvlseeds, replicas): list_split = lambda lst, n: [ arr.tolist() for arr in np.array_split(lst, n) ] - batched_mcseeds = list_split(seeds.mcseeds, NPROC) - batched_trvlseeds = list_split(seeds.trvlseeds, NPROC) + batched_mcseeds = list_split(replicas_mcseed, NPROC) + batched_trvlseeds = list_split(replicas_trvlseeds, NPROC) batched_replica_num = list_split(replica, NPROC) for mc_batch, trvl_batch, replica_batch in zip( batched_mcseeds, batched_trvlseeds, batched_replica_num diff --git a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle b/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle index a7e6829337..e854305bfb 100644 Binary files a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle and b/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle differ