diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 530ced5815..02b8268b9b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -62,23 +62,23 @@ def covmat_from_systematics(commondata): `paper `_ outlining the procedure, specifically equation 2 and surrounding text. - Paramaters + Parameters ---------- - commondata : validphys.coredata.CommonData + commondata: validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. Returns ------- - cov_mat : np.array - Numpy array which is N_dat x N_dat (where N_dat is the number of data - points after cuts) containing uncertainty and correlation information. + cov_mat: np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. Example ------- >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> from validphys.calcutils import covmat_from_systematics + >>> from validphys.covmats import covmat_from_systematics >>> l = Loader() >>> cd = l.check_commondata("NMC") >>> cd = load_commondata(cd) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d62331f467..fc4cc81324 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -12,12 +12,17 @@ import pandas as pd from validphys.checks import check_cuts_fromfit, check_darwin_single_process +from validphys.commondataparser import load_commondata +from validphys.coredata import CommonData +from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect import n3fit.io.reader as reader from n3fit.performfit import initialize_seeds +import NNPDF + log = logging.getLogger(__name__) @@ -25,6 +30,66 @@ context_index = collect("groups_index", ("fitcontext",)) + +def reconstruct_replica(fit, list_of_commondata, replica: int): + """A function that reconstructs the pseudodata seen by replica number ``replica`` + + #TODO: add list_of_commondata + Parameters + ---------- + fit: validphys.core.fit + The input fit + replica: int + The replica number for which the pseudodata is reconstructed + + Returns + ------- + #XXX: maybe this will be a pd.Series + pseudodata: np.array + The pseudodata in array format + + Raises + ------ + KeyError: + If one of ``trvalseed``, ``nnseed``, or ``mcseed`` is not found under the ``fitting`` + namespace + + Example: + >>> from validphys.api import API + >>> API.reconstruct_replica(fit="NNPDF40_nnlo_as_0118", replica=0) + #TODO: finish this example + """ + log.warning(f"Attempt to reconstruct pseudodata for MC replica {replica} " + f"of {fit}. Be advised that this functionality is not available " + "for all fits, but only those fitted after N3FIT switched to the " + "Python MakeReplica function." + ) + + runcard = fit.as_input() + + try: + fit_ns = runcard["fitting"] + trvlseed = fit_ns["trvlseed"] + nnseed = fit_ns["nnseed"] + mcseed = fit_ns["mcseed"] + except KeyError as e: + raise KeyError(f"The following keys were not found in the fitting namespace {str(e)}, " + "please ensure the input fit is a valid N3FIT fit." + ) from e + + seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep=True) + + pseudodata = make_replica(list_of_commondata, seed=mcseed) + + # TODO: change the need for datasets and exp_name + trmasks, vlmasks = reader.make_tr_val_mask(datasets=None, exp_name=None, seed=trvlseed) + + training = pseudodata[:, trmasks] + validation = pseudodata[:, vlmasks] + + return training, validation + + @check_cuts_fromfit def read_fit_pseudodata(fitcontext, context_index): """Generator to handle the reading of training and validation splits for a fit that has been @@ -104,6 +169,124 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index +def make_replica(list_of_commondata, seed=None): + """Function that takes in a list of :py:class:`validphys.coredata.CommonData` + objects and returns pseudodata replicas of the data central value accounting for + possible correlations between systematic uncertainties. + + The function loops until positive definite pseudodata is generated for any + non-asymmetry datasets. In the case of an asymmetry dataset negative values are + permitted so the loop block executes only once. + + Parameters + --------- + list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`] + Iterable of CommonData objects which stores information about systematic errors, + their treatment and description, for each dataset. + + seed: int, None + Seed used to initialise the numpy random number generator. If ``None`` then a random seed is + allocated using the default numpy behaviour. + + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the combined number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + Example + ------- + >>> from validphys.commondataparser import load_commondata + >>> from validphys.loader import Loader + >>> from validphys.pseudodata import make_replica + >>> l = Loader() + >>> cd1 = l.check_commondata("NMC") + >>> cd2 = l.check_commondata("NMCPD") + >>> lds = map(load_commondata, (cd1, cd2)) + >>> make_replica(lds) + array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , + 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, + 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , + """ + # Seed the numpy RNG with the seed. + rng = np.random.default_rng(seed=seed) + + # The inner while True loop is for ensuring a positive definite + # pseudodata replica + while True: + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)) + + # ~~~ ADDITIVE ERRORS ~~~ + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 + ).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + # store the multiplicative errors with correlations between datasets for later use + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() + + + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ rng.normal(size=special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1) + ) + + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): + break + + return all_pseudodata + + @check_darwin_single_process def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): """A function to obtain information about the pseudodata that went