From eef21af15f1e256c58fa490d231700e4c6fb37b6 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 19 Aug 2020 14:57:02 +0100 Subject: [PATCH 01/37] A direct translation of the C++ pseudodata code No attempt at optimization has been done Uses the same seed structure as C++ so will give the same pseudodata if the same RNG seed is used. This commit is a proof of concept. --- validphys2/src/validphys/covmats.py | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 74989025a0..1c2c59272f 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -11,6 +11,8 @@ from reportengine.table import table from validphys.calcutils import regularize_covmat, get_df_block +from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -21,10 +23,64 @@ from validphys.core import PDF, DataGroupSpec, DataSetSpec from validphys.results import ThPredictionsResult +import NNPDF + log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") +def make_replica(commondata, seed=1): + # dataset_input: {dataset: NMC, frac: 0.5} + + # use_cuts: nocuts + + # theoryid: 53 + + # template_text: | + # {@make_replica@} + + # actions_: + # - report(main=True) + + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + ld_cd = load_commondata(commondata) + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue + + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] + + # TODO: check the pseudodata is positive + # requires implementing the above code in a while loop + return artdata + def covmat_from_systematics(commondata, central_values=None): """Take the statistical uncertainty and systematics table from @@ -74,6 +130,12 @@ def covmat_from_systematics(commondata, central_values=None): can be used to calculate, for example, the t0 covariance matrix by using the predictions from the central member of the t0 pdf. + use_mult_errors: bool + Boolean which controls whether we use multiplicative errors + when computing the covmat. By default it should be set to ``True`` + unles one wishes to uses a sampling matrix for pseudodata generation, + in which case it should be set to ``False``. + Returns ------- cov_mat : np.array From dc4dd22118bf3b841916acdc5d96ed7be44c36d3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 11:46:02 +0100 Subject: [PATCH 02/37] Loop until positivity of pseudodata is achieved --- validphys2/src/validphys/covmats.py | 54 ++++++++++++++++------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 1c2c59272f..835b8d62d0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -51,34 +51,40 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) - - correlated_deviates = sampling_matrix @ deviates + # We need to loop until the pseudodata is positive + while True: + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue - artdata = np.array(ld_cd.central_values) + correlated_deviates + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue + artdata = np.array(artdata) - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + # Check positivity of the pseudodata + if all(artdata > 0): + break - # TODO: check the pseudodata is positive - # requires implementing the above code in a while loop return artdata From 9abb31c2ed1905191100ff8b8f989aff9fff8518 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 15:47:37 +0100 Subject: [PATCH 03/37] Using pandas and numpy to generate the pseudodata This removes the for loop implementation --- validphys2/src/validphys/covmats.py | 73 +++++++++++++++++++---------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 835b8d62d0..963893bc4f 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -51,41 +51,64 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. + # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know which of the remaining systematics were + # uncorrelated. The other we mark as NaN + mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + # We need to loop until the pseudodata is positive while True: - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) correlated_deviates = sampling_matrix @ deviates - artdata = np.array(ld_cd.central_values) + correlated_deviates - - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue - - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) - artdata = np.array(artdata) + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata - if all(artdata > 0): + if all(pseudodata > 0): break - return artdata + return pseudodata def covmat_from_systematics(commondata, central_values=None): From 684131710bfe23c704448555cd06532dd39aca1c Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 24 Aug 2020 15:24:47 +0100 Subject: [PATCH 04/37] Adding docstring for make_replica function Also correcting some typos in the docstring of covmat_from_systematics --- validphys2/src/validphys/covmats.py | 62 ++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 963893bc4f..41591541cb 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,6 +12,7 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.coredata import CommonData from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, @@ -30,24 +31,58 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") def make_replica(commondata, seed=1): - # dataset_input: {dataset: NMC, frac: 0.5} + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + of the data central value. - # use_cuts: nocuts + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. - # theoryid: 53 + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. - # template_text: | - # {@make_replica@} + seed: int + Seed used to initialise the random number generator. - # actions_: - # - report(main=True) + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.covmats import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. + """ # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) rng = NNPDF.RandomGenerator.GetRNG() - ld_cd = load_commondata(commondata) + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) @@ -70,8 +105,7 @@ def make_replica(commondata, seed=1): ] mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - # We want to know which of the remaining systematics were - # uncorrelated. The other we mark as NaN + # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] # We need to loop until the pseudodata is positive @@ -149,7 +183,7 @@ def covmat_from_systematics(commondata, central_values=None): Parameters ---------- - commondata : validphys.coredata.CommonData + commondata: validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. central_values : None, np.array @@ -167,9 +201,9 @@ def covmat_from_systematics(commondata, central_values=None): Returns ------- - cov_mat : np.array - Numpy array which is N_dat x N_dat (where N_dat is the number of data - points after cuts) containing uncertainty and correlation information. + cov_mat: np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. Example ------- From 8005e8403a952c768c22e622e66edbb0ddd3e371 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Wed, 26 Aug 2020 13:38:58 +0100 Subject: [PATCH 05/37] Apply suggestions from code review Co-authored-by: Cameron Voisey <32741139+voisey@users.noreply.github.com> --- validphys2/src/validphys/covmats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 41591541cb..88459ea5e4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -32,7 +32,7 @@ def make_replica(commondata, seed=1): """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas of the data central value. The square root (obtained by the Cholesky decomposition) of the covariance @@ -196,7 +196,7 @@ def covmat_from_systematics(commondata, central_values=None): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unles one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to uses a sampling matrix for pseudodata generation, in which case it should be set to ``False``. Returns From 01f2b7fc14debfd639df28d83ed40517bf71eac9 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 13:00:32 +0100 Subject: [PATCH 06/37] Moving to pseudodata.py --- validphys2/src/validphys/covmats.py | 4 - validphys2/src/validphys/pseudodata.py | 120 +++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 88459ea5e4..efdf356433 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,8 +12,6 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec -from validphys.coredata import CommonData -from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -24,8 +22,6 @@ from validphys.core import PDF, DataGroupSpec, DataSetSpec from validphys.results import ThPredictionsResult -import NNPDF - log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d62331f467..d0990114ab 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -12,12 +12,17 @@ import pandas as pd from validphys.checks import check_cuts_fromfit, check_darwin_single_process +from validphys.commondataparser import load_commondata +from validphys.coredata import CommonData +from validphys.covmats import covmat_from_systematics, sqrt_covmat from reportengine import collect import n3fit.io.reader as reader from n3fit.performfit import initialize_seeds +import NNPDF + log = logging.getLogger(__name__) @@ -104,6 +109,121 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index +def make_replica(commondata, seed=1): + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas + of the data central value. + + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. + + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. + + seed: int + Seed used to initialise the random number generator. + + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.pseudodata import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. + """ + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. + # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know the names of the remaining systematics. + mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + + correlated_deviates = sampling_matrix @ deviates + + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) + + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + + # Check positivity of the pseudodata + if all(pseudodata > 0): + break + + return pseudodata + + @check_darwin_single_process def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): """A function to obtain information about the pseudodata that went From ed9edaa8aaa2fc0f2cd426f6dcadeaa3a16f7113 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 15:54:30 +0100 Subject: [PATCH 07/37] Making the function into an infinite generator Thus it remembers the RNG state --- validphys2/src/validphys/pseudodata.py | 74 ++++++++++++++------------ 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d0990114ab..69a65e2aef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -110,9 +110,10 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. + """Generator that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas + of the data central value. This generator is infinite in the sense that it will + never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance matrix is used as the sampling matrix. The covariance matrix is obtained using @@ -141,13 +142,14 @@ def make_replica(commondata, seed=1): >>> l = Loader() >>> from validphys.pseudodata import make_replica >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) + >>> data_generator = make_replica(cd) + >>> next(data_generator) - Random Generator allocated: ranlux array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. @@ -187,41 +189,45 @@ def make_replica(commondata, seed=1): # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - # We need to loop until the pseudodata is positive + # We make this generator infinite in the sense that it shall never + # raise a StopIteration exception. Thus we can continuely request + # pseudodata from it and it will remember the RNG state. while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - correlated_deviates = sampling_matrix @ deviates + correlated_deviates = sampling_matrix @ deviates - pseudodata = np.array(central_values) + correlated_deviates + pseudodata = np.array(central_values) + correlated_deviates - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) ] - for _ in range(ndata) - ] - ) + ) - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): - break + # Check positivity of the pseudodata + if all(pseudodata > 0): + break - return pseudodata + yield pseudodata @check_darwin_single_process From 7bc29fd74d87da3a6e197744b9cafb10b8ff5ed4 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 10:54:01 +0100 Subject: [PATCH 08/37] Adding comment explaining why we divide the multiplicative uncertainties by 100 --- validphys2/src/validphys/pseudodata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 69a65e2aef..186747f62c 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -184,6 +184,8 @@ def make_replica(commondata, seed=1): (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] + # The multiplicative uncertainties are quoted as percentages, so + # we change them to fractional values mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. From 3ccda956d82352fdbb30cf2accb5ba650e27db91 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:29:26 +0100 Subject: [PATCH 09/37] Update validphys2/src/validphys/covmats.py Co-authored-by: Zaharid --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index efdf356433..d70a5a89c3 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -192,7 +192,7 @@ def covmat_from_systematics(commondata, central_values=None): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unless one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to sample pseudodata from a multigaussian distribution, in which case it should be set to ``False``. Returns From 37b460c075367c0c368b3030aa87e3a69398271f Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 16:10:04 +0100 Subject: [PATCH 10/37] Changing covmat_for_sampling to additive_covmat --- validphys2/src/validphys/pseudodata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 186747f62c..09ce272ad2 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -164,8 +164,8 @@ def make_replica(commondata, seed=1): else: ld_cd = commondata - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) + additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(additive_covmat) central_values = ld_cd.central_values nsys, ndata = ld_cd.nsys, ld_cd.ndata From 078cd5c4f889485010cfdaa9677dd57358460812 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 15:30:32 +0100 Subject: [PATCH 11/37] Changing variable names to better reflect their purpose --- validphys2/src/validphys/pseudodata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 09ce272ad2..e111906767 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -179,17 +179,17 @@ def make_replica(commondata, seed=1): # Drop any systematics that are related to theory uncertainties # or have explicitly been labelled as a systematic to be skipped. # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ + mult_sys = systematics.loc[ :, (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] # The multiplicative uncertainties are quoted as percentages, so # we change them to fractional values - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + mult_sys = mult_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + mult_names = sys_names.loc[mult_sys.columns] # We make this generator infinite in the sense that it shall never # raise a StopIteration exception. Thus we can continuely request @@ -213,7 +213,7 @@ def make_replica(commondata, seed=1): [ [ rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + for i, sys_type in mult_names.iteritems() ] for _ in range(ndata) ] @@ -223,7 +223,7 @@ def make_replica(commondata, seed=1): # of the multiplicative uncertainties with the above random matrix. We then # add unity to all elements and take the product across columns. This vector # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata if all(pseudodata > 0): From 55760a077bfc7d0c4829e362582345e1d4c36650 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:12:33 +0100 Subject: [PATCH 12/37] Accounting for the asymmetry edge case --- validphys2/src/validphys/pseudodata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e111906767..1414f8f3ef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,9 @@ def make_replica(commondata, seed=1): # must then be elementwise multiplied by central values + correlated deviates. pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): + # Check positivity of the pseudodata. Note that if the dataset is an asymmetry + # then it is allowed to be negative. + if all(pseudodata > 0) or ("ASY" in commondata.process_type): break yield pseudodata From 8217d8d0ff3974afc850886fc4b46a93e40b2f1b Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:55:48 +0100 Subject: [PATCH 13/37] Adding information on the nature of sampling in the docstring --- validphys2/src/validphys/pseudodata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 1414f8f3ef..0c462cd083 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -120,6 +120,11 @@ def make_replica(commondata, seed=1): :py:func:`validphys.covmats.covmat_from_systematics` and must not use multiplicative systematic uncertainties when obtaining the sampling matrix. + Gaussian sampling is performed using the additive part of the covariance matrix. + An additional gaussian sampling for the renormalization of the data having multiplicative + uncertainties happens later. All the contributions to the covariance matrix coming from + theory errors should be accounted for in the additive part + Parameters --------- commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` From f2d42c7e4f952c0e41ffa386010041fe8f2ca00c Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 6 Oct 2020 17:29:05 +0100 Subject: [PATCH 14/37] Correcting covmat_from_systematics example --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index d70a5a89c3..8559d9aceb 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -205,7 +205,7 @@ def covmat_from_systematics(commondata, central_values=None): ------- >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> from validphys.calcutils import covmat_from_systematics + >>> from validphys.covmats import covmat_from_systematics >>> l = Loader() >>> cd = l.check_commondata("NMC") >>> cd = load_commondata(cd) From 98e53e38165054fd94ed151d07b2bea65de32703 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:15:56 +0100 Subject: [PATCH 15/37] Clarifying doc strings Adding Zahari's review comments --- validphys2/src/validphys/covmats.py | 7 ++++--- validphys2/src/validphys/pseudodata.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 8559d9aceb..76a3410c8d 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -190,10 +190,11 @@ def covmat_from_systematics(commondata, central_values=None): using the predictions from the central member of the t0 pdf. use_mult_errors: bool - Boolean which controls whether we use multiplicative errors + Boolean which controls whether multiplicative errors are used when computing the covmat. By default it should be set to ``True`` - unless one wishes to sample pseudodata from a multigaussian distribution, - in which case it should be set to ``False``. + since setting this parameter to ``False`` will completely ignore multiplicative + uncertainties. This flag is present for use with pseudodata generation where the + multiplicative uncertainties should be ignored. Returns ------- diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0c462cd083..7ed133912d 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -116,14 +116,14 @@ def make_replica(commondata, seed=1): never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Gaussian sampling is performed using the additive part of the covariance matrix. - An additional gaussian sampling for the renormalization of the data having multiplicative - uncertainties happens later. All the contributions to the covariance matrix coming from - theory errors should be accounted for in the additive part + matrix is used as the sampling matrix with the covariance matrix being obtained using + :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call + to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to + ignore multiplicative uncertainties. Gaussian sampling is performed using the + additive part of the covariance matrix. An additional gaussian sampling for the + renormalization of the data having multiplicative uncertainties happens later. + All the contributions to the covariance matrix coming from theory errors should be + accounted for in the additive part. Parameters --------- From 213de7f067a9737fa3cf2b4592d0713952f1cdb4 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:20:40 +0100 Subject: [PATCH 16/37] Setting default seed kwarg to None Assigning seed randomly if not provided explicilty in function call --- validphys2/src/validphys/pseudodata.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 7ed133912d..b804e68853 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,6 +7,7 @@ import multiprocessing as mp import os import pathlib +import random import numpy as np import pandas as pd @@ -109,7 +110,7 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=1): +def make_replica(commondata, seed=None): """Generator that takes in a :py:class:`validphys.coredata.CommonData` or :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -131,8 +132,10 @@ def make_replica(commondata, seed=1): CommonData which stores information about systematic errors, their treatment and description. - seed: int - Seed used to initialise the random number generator. + seed: int, None + Seed used to initialise the random number generator. If ``None`` then a random seed is allocated + using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing + inadvertent correlations. Returns ------- @@ -159,6 +162,11 @@ def make_replica(commondata, seed=1): .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. """ + # If seed is None (i.e not explicitly provided in the function call) + # then set it randomly + if seed is None: + seed = random.randint(1, 1000) + # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) From 612197f18de7990b7b3756b728d868856290fbfa Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 11:23:53 +0000 Subject: [PATCH 17/37] Removing make replica from covmats.py --- validphys2/src/validphys/covmats.py | 114 ---------------------------- 1 file changed, 114 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 76a3410c8d..96c5453d8c 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -26,120 +26,6 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") -def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Parameters - --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description. - - seed: int - Seed used to initialise the random number generator. - - Returns - ------- - pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) - containing monte carlo samples of data centered around the data central value. - - - Example - ------- - >>> from validphys.loader import Loader - >>> l = Loader() - >>> from validphys.covmats import make_replica - >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. - """ - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. - # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - - # We need to loop until the pseudodata is positive - while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() - ] - for _ in range(ndata) - ] - ) - - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - - # Check positivity of the pseudodata - if all(pseudodata > 0): - break - - return pseudodata - def covmat_from_systematics(commondata, central_values=None): """Take the statistical uncertainty and systematics table from From 3f1ce74a6cad692f4246a9bbd75f68386680df01 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 13:35:46 +0000 Subject: [PATCH 18/37] Adding use_mult_errors flag to datasets_covmat_from_systematics --- validphys2/src/validphys/covmats.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 96c5453d8c..263fabd17d 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -75,13 +75,6 @@ def covmat_from_systematics(commondata, central_values=None): can be used to calculate, for example, the t0 covariance matrix by using the predictions from the central member of the t0 pdf. - use_mult_errors: bool - Boolean which controls whether multiplicative errors are used - when computing the covmat. By default it should be set to ``True`` - since setting this parameter to ``False`` will completely ignore multiplicative - uncertainties. This flag is present for use with pseudodata generation where the - multiplicative uncertainties should be ignored. - Returns ------- cov_mat: np.array @@ -139,6 +132,10 @@ def datasets_covmat_from_systematics( contribution. By default this is None and the experimental central values are used. + use_mult_errors: bool + Whether or not to include multiplicative systematic errors in the computation of + the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. + Returns ------- cov_mat : np.array From f0fa6282add7e68dadba2bda40f9f34eb60a1d1b Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 7 Dec 2020 15:17:21 +0000 Subject: [PATCH 19/37] Adding functionality to compute pseudodata for a list of commondata Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 140 ++++++++++++------------- 1 file changed, 69 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index b804e68853..2e7d5cbee4 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -15,7 +15,7 @@ from validphys.checks import check_cuts_fromfit, check_darwin_single_process from validphys.commondataparser import load_commondata from validphys.coredata import CommonData -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect @@ -110,7 +110,8 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=None): +def make_replica(list_of_commondata, seed=None): + # TODO: this is the old docstring: update it """Generator that takes in a :py:class:`validphys.coredata.CommonData` or :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -165,85 +166,82 @@ def make_replica(commondata, seed=None): # If seed is None (i.e not explicitly provided in the function call) # then set it randomly if seed is None: - seed = random.randint(1, 1000) + seed = np.random.randint(1000) - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(additive_covmat) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. - # We are only interested in multiplicative uncertainties - mult_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - # The multiplicative uncertainties are quoted as percentages, so - # we change them to fractional values - mult_sys = mult_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_names = sys_names.loc[mult_sys.columns] + # Seed the numpy RNG with the seed. + np.random.seed(seed=seed) # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continuely request + # raise a StopIteration exception. Thus we can continually request # pseudodata from it and it will remember the RNG state. while True: - # We need to loop until the pseudodata is positive + # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_names.iteritems() - ] - for _ in range(ndata) - ] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + ).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata. Note that if the dataset is an asymmetry - # then it is allowed to be negative. - if all(pseudodata > 0) or ("ASY" in commondata.process_type): + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) + + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): break - yield pseudodata + yield all_pseudodata @check_darwin_single_process From c7626b80b9a19b0a57996f3bb57e0485701d54ce Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 8 Dec 2020 13:09:15 +0000 Subject: [PATCH 20/37] Adding fillna(0) to special additive errors --- validphys2/src/validphys/pseudodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 2e7d5cbee4..da9311e00b 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,8 @@ def make_replica(list_of_commondata, seed=None): check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) # if we sort here (which sorts columns), then permuting datasets doesn't change the result - special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() From c3fd73f9f2b2d337e71dd7fefe520aad3fed5888 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 9 Dec 2020 15:58:08 +0000 Subject: [PATCH 21/37] Changing covmats.py to not use the use_mult_errors flag --- validphys2/src/validphys/covmats.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 263fabd17d..189d61d89b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -132,10 +132,6 @@ def datasets_covmat_from_systematics( contribution. By default this is None and the experimental central values are used. - use_mult_errors: bool - Whether or not to include multiplicative systematic errors in the computation of - the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. - Returns ------- cov_mat : np.array From a5213cf605b27e3de60b832c304345f84171ba55 Mon Sep 17 00:00:00 2001 From: wilsonmr <33907451+wilsonmr@users.noreply.github.com> Date: Wed, 16 Dec 2020 10:01:31 +0000 Subject: [PATCH 22/37] Apply suggestions from code review - comments to split up code Co-authored-by: Rosalyn Pearson <33020850+RosalynLP@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index da9311e00b..eb450f0fc9 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,7 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() @@ -197,9 +198,11 @@ def make_replica(list_of_commondata, seed=None): # append the partially shifted pseudodata pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use special_add.append( add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ mult_errors = cd.multiplicative_errors mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction @@ -214,6 +217,7 @@ def make_replica(list_of_commondata, seed=None): mult_shifts.append(mult_shift) + # store the multiplicative errors with correlations between datasets for later use special_mult.append( mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) From 7ee813f204f1ae792ef4fd22817684b2f3f5a7fa Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 16 Dec 2020 10:17:23 +0000 Subject: [PATCH 23/37] include stat errors --- validphys2/src/validphys/pseudodata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index eb450f0fc9..c74529b48f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,9 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() From 46020df63205a844ae9309def12a89af639d415d Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 15 Jan 2021 14:06:45 +0000 Subject: [PATCH 24/37] Removing if seed is None Numpy has this as default behaviour Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index c74529b48f..0a1b70e064 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -163,11 +163,6 @@ def make_replica(list_of_commondata, seed=None): .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. """ - # If seed is None (i.e not explicitly provided in the function call) - # then set it randomly - if seed is None: - seed = np.random.randint(1000) - # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From 3d776a5d618cf014be8ba69c637917cf6dedc1d0 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:18:24 +0000 Subject: [PATCH 25/37] Changing generator to function --- validphys2/src/validphys/pseudodata.py | 138 ++++++++++++------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0a1b70e064..b65024a5b7 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -166,84 +166,80 @@ def make_replica(list_of_commondata, seed=None): # Seed the numpy RNG with the seed. np.random.seed(seed=seed) - # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continually request - # pseudodata from it and it will remember the RNG state. + # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # The inner while True loop is for ensuring a positive definite - # pseudodata replica - while True: - pseudodatas = [] - special_add = [] - special_mult = [] - mult_shifts = [] - check_positive_masks = [] - for cd in list_of_commondata: - pseudodata = cd.central_values.to_numpy() - - # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) - - # ~~~ ADDITIVE ERRORS ~~~ - add_errors = cd.additive_errors - add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) - - # correlated within dataset - add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) - - # append the partially shifted pseudodata - pseudodatas.append(pseudodata) - # store the additive errors with correlations between datasets for later use - special_add.append( - add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) - # ~~~ MULTIPLICATIVE ERRORS ~~~ - mult_errors = cd.multiplicative_errors - mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() - # convert to from percent to fraction - mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 - ).prod(axis=1) - - mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() - mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 - ).prod(axis=1) - - mult_shifts.append(mult_shift) - - # store the multiplicative errors with correlations between datasets for later use - special_mult.append( - mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + + # ~~~ ADDITIVE ERRORS ~~~ + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + ).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + # store the multiplicative errors with correlations between datasets for later use + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) - # mask out the data we want to check are all positive - if "ASY" in cd.commondataproc: - check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) - else: - check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - # if we sort here (which sorts columns), then permuting datasets doesn't change the result - # non-overlapping systematics are set to NaN by concat, fill with 0 instead. - special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() - special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - all_pseudodata = ( - np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) - ) * ( - np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) - ) + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) - if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): - break + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): + break - yield all_pseudodata + return all_pseudodata @check_darwin_single_process From 1984534edf01378c7e29ea082b70b956f1e0d3b8 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:27:56 +0000 Subject: [PATCH 26/37] Updating doc string --- validphys2/src/validphys/pseudodata.py | 58 ++++++++++---------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index b65024a5b7..dcd47203b3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -111,57 +111,43 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(list_of_commondata, seed=None): - # TODO: this is the old docstring: update it - """Generator that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas - of the data central value. This generator is infinite in the sense that it will - never raise a StopIteration exception - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix with the covariance matrix being obtained using - :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call - to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to - ignore multiplicative uncertainties. Gaussian sampling is performed using the - additive part of the covariance matrix. An additional gaussian sampling for the - renormalization of the data having multiplicative uncertainties happens later. - All the contributions to the covariance matrix coming from theory errors should be - accounted for in the additive part. + """Function that takes in a list of :py:class:`validphys.coredata.CommonData` + objects and returns pseudodata replicas of the data central value accounting for + possible correlations between systematic uncertainties. + + The function loops until positive definite pseudodata is generated for any + non-asymmetry datasets. In the case of an asymmetry dataset negative values are + permitted so the loop block executes only once. Parameters --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description. + list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`] + Iterable of CommonData objects which stores information about systematic errors, + their treatment and description, for each dataset. seed: int, None - Seed used to initialise the random number generator. If ``None`` then a random seed is allocated - using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing - inadvertent correlations. + Seed used to initialise the numpy random number generator. If ``None`` then a random seed is + allocated using the default numpy behaviour. Returns ------- pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) + Numpy array which is N_dat (where N_dat is the combined number of data points after cuts) containing monte carlo samples of data centered around the data central value. - Example ------- + >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> l = Loader() >>> from validphys.pseudodata import make_replica - >>> cd = l.check_commondata("NMC") - >>> data_generator = make_replica(cd) - >>> next(data_generator) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. + >>> l = Loader() + >>> cd1 = l.check_commondata("NMC") + >>> cd2 = l.check_commondata("NMCPD") + >>> lds = map(load_commondata, (cd1, cd2)) + >>> make_replica(lds) + array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , + 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, + 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From b5df64e4f56d53e5811b87819e34df518d67046b Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 16:16:34 +0000 Subject: [PATCH 27/37] Using new numpy.random interface --- validphys2/src/validphys/pseudodata.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index dcd47203b3..e5ec8a30f6 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,7 +7,6 @@ import multiprocessing as mp import os import pathlib -import random import numpy as np import pandas as pd @@ -150,7 +149,7 @@ def make_replica(list_of_commondata, seed=None): 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. - np.random.seed(seed=seed) + rng = np.random.default_rng(seed=seed) # The inner while True loop is for ensuring a positive definite # pseudodata replica @@ -164,17 +163,17 @@ def make_replica(list_of_commondata, seed=None): pseudodata = cd.central_values.to_numpy() # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)) # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1) # correlated within dataset add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1]) # append the partially shifted pseudodata pseudodatas.append(pseudodata) @@ -187,12 +186,12 @@ def make_replica(list_of_commondata, seed=None): mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 ).prod(axis=1) mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 ).prod(axis=1) mult_shifts.append(mult_shift) @@ -216,10 +215,10 @@ def make_replica(list_of_commondata, seed=None): all_pseudodata = ( np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + + special_add_errors @ rng.normal(size=special_add_errors.shape[1]) ) * ( np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1) ) if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): From ed674e84297f1cf5fbd5d2d1a3a51e3b40c4d063 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 27 Jan 2021 18:09:40 +0000 Subject: [PATCH 28/37] add copy to central values before generating data --- validphys2/src/validphys/pseudodata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e5ec8a30f6..28c150ecc3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -160,7 +160,8 @@ def make_replica(list_of_commondata, seed=None): mult_shifts = [] check_positive_masks = [] for cd in list_of_commondata: - pseudodata = cd.central_values.to_numpy() + # copy here to avoid mutating the central values. + pseudodata = cd.central_values.to_numpy(copy=True) # add contribution from statistical uncertainty pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)) From f4810349e48dda1f673cbf2cd6a2c053265d49e8 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Thu, 28 Jan 2021 12:21:25 +0000 Subject: [PATCH 29/37] added some tests to catch that central values are unchanged and that generated data is correct length. --- .../validphys/tests/test_pythonmakereplica.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 validphys2/src/validphys/tests/test_pythonmakereplica.py diff --git a/validphys2/src/validphys/tests/test_pythonmakereplica.py b/validphys2/src/validphys/tests/test_pythonmakereplica.py new file mode 100644 index 0000000000..639d0a371b --- /dev/null +++ b/validphys2/src/validphys/tests/test_pythonmakereplica.py @@ -0,0 +1,78 @@ +""" +test_pythonmakereplica.py + +Module for testing the python implementation of make replica + +""" +import numpy as np +import pytest + +from validphys.api import API +from validphys.pseudodata import make_replica +from validphys.tests.conftest import DATA +from validphys.tests.test_covmats import CORR_DATA + + +SINGLE_SYS_DATASETS = [ + {"dataset": "DYE886R"}, + {"dataset": "D0ZRAP", "cfac": ["QCD"]}, + {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "cfac": ["QCD"]}, + {"dataset": "CMS_SINGLETOP_TCH_R_8TEV", "cfac": ["QCD"]}, + {"dataset": "CMS_SINGLETOP_TCH_R_13TEV", "cfac": ["QCD"]}, +] + +@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"]) +@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS]) +def test_commondata_unchanged(data_config, dataset_inputs, use_cuts): + """Load the commondata, then generate some pseudodata using make replica + Check that the following attributes of the commondata have not been + modified: central_values, commondata_table, systematics_table. + + """ + config = dict(data_config) + config["dataset_inputs"] = dataset_inputs + config["use_cuts"] = use_cuts + ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config) + # keep a copy of all central values and all systematics + original_cvs = [np.array(cd.central_values.to_numpy(copy=True)) for cd in ld_cds] + original_sys = [np.array(cd.systematics_table.to_numpy(copy=True)) for cd in ld_cds] + make_replica(ld_cds) + for post_mkrep_cd, pre_mkrep_cv, pre_mkrep_sys in zip(ld_cds, original_cvs, original_sys): + np.testing.assert_allclose( + post_mkrep_cd.central_values.to_numpy(), pre_mkrep_cv + ) + np.testing.assert_allclose( + post_mkrep_cd.systematics_table.to_numpy(), pre_mkrep_sys + ) + + +@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"]) +@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS]) +def test_pseudodata_seeding(data_config, dataset_inputs, use_cuts): + """Check that using a seed reproduces the pseudodata. Note that this also + will check that the commondata hasn't been modified since reproducing + the same commondata requires that the commondata is unchanged and that + the random numbers are generated and used identically. + + """ + seed = 123456 + config = dict(data_config) + config["dataset_inputs"] = dataset_inputs + config["use_cuts"] = use_cuts + ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config) + rep_1 = make_replica(ld_cds, seed=seed) + rep_2 = make_replica(ld_cds, seed=seed) + np.testing.assert_allclose(rep_1, rep_2) + + +@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"]) +@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS]) +def test_pseudodata_has_correct_ndata(data_config, dataset_inputs, use_cuts): + """Check that we get the correct ndata when generating pseudodata""" + config = dict(data_config) + config["dataset_inputs"] = dataset_inputs + config["use_cuts"] = use_cuts + ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config) + rep = make_replica(ld_cds) + ndata = np.sum([cd.ndata for cd in ld_cds]) + assert len(rep) == ndata From 133c07cb811647217f66fa2c0a963eb485ce0873 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 3 Feb 2021 10:44:37 +0000 Subject: [PATCH 30/37] update make replica test that commondata is unchanged to do what it says it does --- .../validphys/tests/test_pythonmakereplica.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/validphys2/src/validphys/tests/test_pythonmakereplica.py b/validphys2/src/validphys/tests/test_pythonmakereplica.py index 639d0a371b..627de972de 100644 --- a/validphys2/src/validphys/tests/test_pythonmakereplica.py +++ b/validphys2/src/validphys/tests/test_pythonmakereplica.py @@ -4,7 +4,10 @@ Module for testing the python implementation of make replica """ +from copy import deepcopy + import numpy as np +from pandas.testing import assert_frame_equal, assert_series_equal import pytest from validphys.api import API @@ -33,17 +36,22 @@ def test_commondata_unchanged(data_config, dataset_inputs, use_cuts): config["dataset_inputs"] = dataset_inputs config["use_cuts"] = use_cuts ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config) - # keep a copy of all central values and all systematics - original_cvs = [np.array(cd.central_values.to_numpy(copy=True)) for cd in ld_cds] - original_sys = [np.array(cd.systematics_table.to_numpy(copy=True)) for cd in ld_cds] + + # keep a copy of all dataframes/series pre make replica + pre_mkrep_cvs = [deepcopy(cd.central_values) for cd in ld_cds] + pre_mkrep_sys_tabs = [deepcopy(cd.systematics_table) for cd in ld_cds] + pre_mkrep_cd_tabs = [deepcopy(cd.commondata_table) for cd in ld_cds] + make_replica(ld_cds) - for post_mkrep_cd, pre_mkrep_cv, pre_mkrep_sys in zip(ld_cds, original_cvs, original_sys): - np.testing.assert_allclose( - post_mkrep_cd.central_values.to_numpy(), pre_mkrep_cv - ) - np.testing.assert_allclose( - post_mkrep_cd.systematics_table.to_numpy(), pre_mkrep_sys - ) + + for post_mkrep_cd, pre_mkrep_cv in zip(ld_cds, pre_mkrep_cvs): + assert_series_equal(post_mkrep_cd.central_values, pre_mkrep_cv) + + for post_mkrep_cd, pre_mkrep_sys_tab in zip(ld_cds, pre_mkrep_sys_tabs): + assert_frame_equal(post_mkrep_cd.systematics_table, pre_mkrep_sys_tab) + + for post_mkrep_cd, pre_mkrep_cd_tab in zip(ld_cds, pre_mkrep_cd_tabs): + assert_frame_equal(post_mkrep_cd.commondata_table, pre_mkrep_cd_tab) @pytest.mark.parametrize("use_cuts", ["nocuts", "internal"]) From 8378139cfff032992b3de495636fa1fe3218f708 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:38:56 +0000 Subject: [PATCH 31/37] Apply suggestions from code review Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 28c150ecc3..f6ee7d8472 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -111,7 +111,7 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(list_of_commondata, seed=None): """Function that takes in a list of :py:class:`validphys.coredata.CommonData` - objects and returns pseudodata replicas of the data central value accounting for + objects and returns a pseudodata replica accounting for possible correlations between systematic uncertainties. The function loops until positive definite pseudodata is generated for any @@ -208,7 +208,6 @@ def make_replica(list_of_commondata, seed=None): else: check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - # if we sort here (which sorts columns), then permuting datasets doesn't change the result # non-overlapping systematics are set to NaN by concat, fill with 0 instead. special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() From 50f33aa6de88634e8be19cd5196627621e15e8ee Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 16:40:47 +0000 Subject: [PATCH 32/37] Fixing rebase problems --- validphys2/src/validphys/covmats.py | 1 - validphys2/src/validphys/pseudodata.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 189d61d89b..e913f36105 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -19,7 +19,6 @@ check_speclabels_different, check_data_cuts_match_theorycovmat, ) -from validphys.core import PDF, DataGroupSpec, DataSetSpec from validphys.results import ThPredictionsResult log = logging.getLogger(__name__) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index f6ee7d8472..054a996eb8 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -12,8 +12,6 @@ import pandas as pd from validphys.checks import check_cuts_fromfit, check_darwin_single_process -from validphys.commondataparser import load_commondata -from validphys.coredata import CommonData from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect @@ -21,8 +19,6 @@ import n3fit.io.reader as reader from n3fit.performfit import initialize_seeds -import NNPDF - log = logging.getLogger(__name__) From 432063d26d071dc32dce7a63f8db02c463fe83a2 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 16:41:33 +0000 Subject: [PATCH 33/37] iterable -> list in docstring --- validphys2/src/validphys/pseudodata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 054a996eb8..e083c1f09f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -116,8 +116,8 @@ def make_replica(list_of_commondata, seed=None): Parameters --------- - list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`] - Iterable of CommonData objects which stores information about systematic errors, + list_of_commondata: list[:py:class:`validphys.coredata.CommonData`] + List of CommonData objects which stores information about systematic errors, their treatment and description, for each dataset. seed: int, None From e00e8d00d58b5e2be3630fa791eee3b4ca9a0251 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 16:49:32 +0000 Subject: [PATCH 34/37] using API to load cd --- validphys2/src/validphys/pseudodata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e083c1f09f..80b6770972 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -132,13 +132,13 @@ def make_replica(list_of_commondata, seed=None): Example ------- - >>> from validphys.commondataparser import load_commondata - >>> from validphys.loader import Loader + >>> from validphys.api import API >>> from validphys.pseudodata import make_replica - >>> l = Loader() - >>> cd1 = l.check_commondata("NMC") - >>> cd2 = l.check_commondata("NMCPD") - >>> lds = map(load_commondata, (cd1, cd2)) + >>> lds = API.dataset_inputs_loaded_cd_with_cuts( + dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}], + use_cuts="nocuts", + theoryid=53 + ) >>> make_replica(lds) array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, From ae2623e0ab42373c78a68a93769db7ac18cb15ea Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 17:47:13 +0000 Subject: [PATCH 35/37] making make replica a provider --- validphys2/src/validphys/pseudodata.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 80b6770972..b1ec0c3933 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -105,7 +105,7 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(list_of_commondata, seed=None): +def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None): """Function that takes in a list of :py:class:`validphys.coredata.CommonData` objects and returns a pseudodata replica accounting for possible correlations between systematic uncertainties. @@ -116,7 +116,7 @@ def make_replica(list_of_commondata, seed=None): Parameters --------- - list_of_commondata: list[:py:class:`validphys.coredata.CommonData`] + dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`] List of CommonData objects which stores information about systematic errors, their treatment and description, for each dataset. @@ -133,13 +133,11 @@ def make_replica(list_of_commondata, seed=None): Example ------- >>> from validphys.api import API - >>> from validphys.pseudodata import make_replica - >>> lds = API.dataset_inputs_loaded_cd_with_cuts( + >>> pseudodata = API.make_replica( dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}], use_cuts="nocuts", theoryid=53 ) - >>> make_replica(lds) array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , @@ -155,7 +153,7 @@ def make_replica(list_of_commondata, seed=None): special_mult = [] mult_shifts = [] check_positive_masks = [] - for cd in list_of_commondata: + for cd in dataset_inputs_loaded_cd_with_cuts: # copy here to avoid mutating the central values. pseudodata = cd.central_values.to_numpy(copy=True) From bbbd22697fb0843e0f848fe513083ed7423b116a Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 17:49:50 +0000 Subject: [PATCH 36/37] index pseudodata --- validphys2/src/validphys/pseudodata.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index b1ec0c3933..dba391e6ad 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -221,6 +221,13 @@ def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None): return all_pseudodata +def indexed_make_replica(groups_index, make_replica): + """Index the make_replica pseudodata appropriately + """ + + return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) + + @check_darwin_single_process def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): """A function to obtain information about the pseudodata that went From 6a12bd628d113f6e5aa7821d47c4db4814e72c3f Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Feb 2021 18:20:20 +0000 Subject: [PATCH 37/37] Adding regression test for make_replica --- validphys2/src/validphys/tableloader.py | 5 + .../tests/regressions/test_mcreplica.csv | 267 ++++++++++++++++++ .../src/validphys/tests/test_regressions.py | 15 +- 3 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 validphys2/src/validphys/tests/regressions/test_mcreplica.csv diff --git a/validphys2/src/validphys/tableloader.py b/validphys2/src/validphys/tableloader.py index 34883ad293..8d4a493fbb 100644 --- a/validphys2/src/validphys/tableloader.py +++ b/validphys2/src/validphys/tableloader.py @@ -32,6 +32,11 @@ def fixup_header(df, head_index, dtype): *oldcols.levels[head_index+1:]]) df.columns = newcols +def parse_data_cv(filename): + """Useful for reading DataFrames with just one column.""" + df = sane_load(filename, index_col=[0, 1, 2]) + return df + def parse_exp_mat(filename): """Parse a dump of a matrix like experiments_covmat.""" df = sane_load(filename, header=[0,1,2], index_col=[0,1,2]) diff --git a/validphys2/src/validphys/tests/regressions/test_mcreplica.csv b/validphys2/src/validphys/tests/regressions/test_mcreplica.csv new file mode 100644 index 0000000000..bdfab83f4b --- /dev/null +++ b/validphys2/src/validphys/tests/regressions/test_mcreplica.csv @@ -0,0 +1,267 @@ +group dataset id data +ATLAS ATLASWZRAP36PB 0 625874.0721154683 +ATLAS ATLASWZRAP36PB 1 629690.9940296302 +ATLAS ATLASWZRAP36PB 2 647804.631311913 +ATLAS ATLASWZRAP36PB 3 640294.5028097499 +ATLAS ATLASWZRAP36PB 4 666615.6925633255 +ATLAS ATLASWZRAP36PB 5 679742.6040870899 +ATLAS ATLASWZRAP36PB 6 661543.8956116233 +ATLAS ATLASWZRAP36PB 7 661401.0725677101 +ATLAS ATLASWZRAP36PB 8 694185.1736757511 +ATLAS ATLASWZRAP36PB 9 674955.0227261193 +ATLAS ATLASWZRAP36PB 10 595607.510395848 +ATLAS ATLASWZRAP36PB 11 461178.58903195267 +ATLAS ATLASWZRAP36PB 12 454625.3670724297 +ATLAS ATLASWZRAP36PB 13 481323.87751588045 +ATLAS ATLASWZRAP36PB 14 458849.11243098835 +ATLAS ATLASWZRAP36PB 15 439424.94273936 +ATLAS ATLASWZRAP36PB 16 432160.16046439036 +ATLAS ATLASWZRAP36PB 17 403846.69581751985 +ATLAS ATLASWZRAP36PB 18 396368.08282609435 +ATLAS ATLASWZRAP36PB 19 393885.7902546321 +ATLAS ATLASWZRAP36PB 20 366443.97889378696 +ATLAS ATLASWZRAP36PB 21 344324.53993439255 +ATLAS ATLASWZRAP36PB 22 135144.70842506472 +ATLAS ATLASWZRAP36PB 23 139058.67466877738 +ATLAS ATLASWZRAP36PB 24 129580.76369541005 +ATLAS ATLASWZRAP36PB 25 125247.04589283372 +ATLAS ATLASWZRAP36PB 26 119478.16024355698 +ATLAS ATLASWZRAP36PB 27 115032.04083971222 +ATLAS ATLASWZRAP36PB 28 104865.22589012324 +ATLAS ATLASWZRAP36PB 29 57082.53327858195 +ATLAS ATLASZHIGHMASS49FB 0 225.2241265581888 +ATLAS ATLASZHIGHMASS49FB 1 102.44036519725282 +ATLAS ATLASZHIGHMASS49FB 2 51.11583760792298 +ATLAS ATLASZHIGHMASS49FB 3 27.488866078195667 +ATLAS ATLASZHIGHMASS49FB 4 18.220176599654852 +ATLAS ATLASZHIGHMASS49FB 5 11.174983683757414 +ATLAS ATLASZHIGHMASS49FB 6 8.695255033748985 +ATLAS ATLASZHIGHMASS49FB 7 4.521850917275335 +ATLAS ATLASZHIGHMASS49FB 8 1.7204940792698935 +ATLAS ATLASZHIGHMASS49FB 9 0.4698742453309486 +ATLAS ATLASZHIGHMASS49FB 10 0.14129611171097067 +ATLAS ATLASZHIGHMASS49FB 11 0.027030707761265878 +ATLAS ATLASZHIGHMASS49FB 12 0.005449291740977146 +ATLAS ATLASLOMASSDY11EXT 0 13576.517089454817 +ATLAS ATLASLOMASSDY11EXT 1 23865.177412820023 +ATLAS ATLASLOMASSDY11EXT 2 14606.30537341771 +ATLAS ATLASLOMASSDY11EXT 3 6812.0492301686645 +ATLAS ATLASLOMASSDY11EXT 4 2873.966448543525 +ATLAS ATLASLOMASSDY11EXT 5 1270.81693829667 +ATLAS ATLASWZRAP11 0 575091.3781838007 +ATLAS ATLASWZRAP11 1 573831.5774693731 +ATLAS ATLASWZRAP11 2 577308.4133944701 +ATLAS ATLASWZRAP11 3 581166.6968380663 +ATLAS ATLASWZRAP11 4 579180.5469411756 +ATLAS ATLASWZRAP11 5 595392.1602877442 +ATLAS ATLASWZRAP11 6 590119.459024331 +ATLAS ATLASWZRAP11 7 598485.3027461147 +ATLAS ATLASWZRAP11 8 596928.0553084731 +ATLAS ATLASWZRAP11 9 589065.2544999232 +ATLAS ATLASWZRAP11 10 554887.8805729451 +ATLAS ATLASWZRAP11 11 432286.1471328586 +ATLAS ATLASWZRAP11 12 428885.6752850355 +ATLAS ATLASWZRAP11 13 426354.80807207135 +ATLAS ATLASWZRAP11 14 419728.3652439385 +ATLAS ATLASWZRAP11 15 411428.30053804716 +ATLAS ATLASWZRAP11 16 402163.10469116265 +ATLAS ATLASWZRAP11 17 384833.51730835374 +ATLAS ATLASWZRAP11 18 373988.9640125536 +ATLAS ATLASWZRAP11 19 360622.88887275255 +ATLAS ATLASWZRAP11 20 340702.7760869247 +ATLAS ATLASWZRAP11 21 317134.5980054034 +ATLAS ATLASWZRAP11 22 133799.96364220293 +ATLAS ATLASWZRAP11 23 133448.86365596423 +ATLAS ATLASWZRAP11 24 132637.63442726055 +ATLAS ATLASWZRAP11 25 130409.92750454393 +ATLAS ATLASWZRAP11 26 130912.15120182281 +ATLAS ATLASWZRAP11 27 127353.39305518712 +ATLAS ATLASWZRAP11 28 117895.4802432277 +ATLAS ATLASWZRAP11 29 105832.75453175441 +ATLAS ATLASWZRAP11 30 88155.7125609602 +ATLAS ATLASWZRAP11 31 67414.70246827461 +ATLAS ATLASWZRAP11 32 44841.69930108373 +ATLAS ATLASWZRAP11 33 21730.44158986774 +CMS CMSZDIFF12 0 9886.334551571706 +CMS CMSZDIFF12 1 2906.558370791581 +CMS CMSZDIFF12 2 1065.8985621057848 +CMS CMSZDIFF12 3 456.26790474310286 +CMS CMSZDIFF12 4 224.22271558328507 +CMS CMSZDIFF12 5 110.44990446224114 +CMS CMSZDIFF12 6 62.579695282891535 +CMS CMSZDIFF12 7 30.063450520529134 +CMS CMSZDIFF12 8 13.922543892672062 +CMS CMSZDIFF12 9 0.6141016049164042 +CMS CMSZDIFF12 10 9856.366953238841 +CMS CMSZDIFF12 11 2865.600713840613 +CMS CMSZDIFF12 12 1059.860323720172 +CMS CMSZDIFF12 13 445.179296515881 +CMS CMSZDIFF12 14 212.87853289770084 +CMS CMSZDIFF12 15 109.79806111700381 +CMS CMSZDIFF12 16 57.81793265111689 +CMS CMSZDIFF12 17 30.846128145509127 +CMS CMSZDIFF12 18 13.294573202916089 +CMS CMSZDIFF12 19 0.6295615951503276 +CMS CMSZDIFF12 20 9172.143310236264 +CMS CMSZDIFF12 21 2584.4159164450075 +CMS CMSZDIFF12 22 934.5819007730142 +CMS CMSZDIFF12 23 414.03599558528543 +CMS CMSZDIFF12 24 202.31230308651277 +CMS CMSZDIFF12 25 101.61697272749291 +CMS CMSZDIFF12 26 55.241506126064486 +CMS CMSZDIFF12 27 28.99689277027626 +CMS CMSZDIFF12 28 13.300620394041099 +CMS CMSZDIFF12 29 0.5485904704152681 +CMS CMSZDIFF12 30 6881.389313938887 +CMS CMSZDIFF12 31 1929.5303610925562 +CMS CMSZDIFF12 32 742.7463056241985 +CMS CMSZDIFF12 33 318.6113512739281 +CMS CMSZDIFF12 34 166.3854976523764 +CMS CMSZDIFF12 35 83.77177274421992 +CMS CMSZDIFF12 36 47.38911931541346 +CMS CMSZDIFF12 37 23.574007377898496 +CMS CMSZDIFF12 38 11.935536472481251 +CMS CMSZDIFF12 39 0.45705487477676426 +CMS CMSZDIFF12 40 3707.934954547729 +CMS CMSZDIFF12 41 1021.7408959849994 +CMS CMSZDIFF12 42 383.9123726446375 +CMS CMSZDIFF12 43 178.03122251767607 +CMS CMSZDIFF12 44 92.3591212500585 +CMS CMSZDIFF12 45 47.898485423688264 +CMS CMSZDIFF12 46 28.687924568924732 +CMS CMSZDIFF12 47 15.073356404547805 +CMS CMSZDIFF12 48 6.8587092533392475 +CMS CMSZDIFF12 49 0.2537524987802203 +CMS CMSJETS11 0 3064.8412856542986 +CMS CMSJETS11 1 1449.5091642980894 +CMS CMSJETS11 2 676.135506669856 +CMS CMSJETS11 3 328.6688802887687 +CMS CMSJETS11 4 170.7883047739323 +CMS CMSJETS11 5 89.97188258438256 +CMS CMSJETS11 6 48.1640183282502 +CMS CMSJETS11 7 26.21942900354327 +CMS CMSJETS11 8 14.019421384758264 +CMS CMSJETS11 9 7.99060694491261 +CMS CMSJETS11 10 4.620071821124748 +CMS CMSJETS11 11 2.653929526854456 +CMS CMSJETS11 12 1.5706174029329598 +CMS CMSJETS11 13 0.8776470184526391 +CMS CMSJETS11 14 0.5180721939935242 +CMS CMSJETS11 15 0.3138789290162737 +CMS CMSJETS11 16 0.18168510415487119 +CMS CMSJETS11 17 0.10788319709417281 +CMS CMSJETS11 18 0.06449279142848932 +CMS CMSJETS11 19 0.03712444722658268 +CMS CMSJETS11 20 0.021921922929691376 +CMS CMSJETS11 21 0.013033700911296818 +CMS CMSJETS11 22 0.007512607144544201 +CMS CMSJETS11 23 0.004114373157279753 +CMS CMSJETS11 24 0.0022362655927216464 +CMS CMSJETS11 25 0.001482939378122927 +CMS CMSJETS11 26 0.0007342552398618355 +CMS CMSJETS11 27 0.00031227731387260945 +CMS CMSJETS11 28 0.00020129807744728782 +CMS CMSJETS11 29 9.225641840455252e-05 +CMS CMSJETS11 30 5.8513477579783676e-05 +CMS CMSJETS11 31 1.4973792398939266e-05 +CMS CMSJETS11 32 2.257710172411508e-06 +CMS CMSJETS11 33 2938.4983182356086 +CMS CMSJETS11 34 1306.838248806242 +CMS CMSJETS11 35 597.4152766849537 +CMS CMSJETS11 36 290.0206657424005 +CMS CMSJETS11 37 152.23116359185778 +CMS CMSJETS11 38 83.52798139030051 +CMS CMSJETS11 39 42.09591785153588 +CMS CMSJETS11 40 22.8506640473896 +CMS CMSJETS11 41 13.020425758807377 +CMS CMSJETS11 42 7.290889884016562 +CMS CMSJETS11 43 4.026941804347156 +CMS CMSJETS11 44 2.3465671617499315 +CMS CMSJETS11 45 1.3340712136907575 +CMS CMSJETS11 46 0.7737929315587418 +CMS CMSJETS11 47 0.43960904171665677 +CMS CMSJETS11 48 0.25276210474175886 +CMS CMSJETS11 49 0.14717720949714796 +CMS CMSJETS11 50 0.0847412733331661 +CMS CMSJETS11 51 0.048881737479790216 +CMS CMSJETS11 52 0.028451560671024542 +CMS CMSJETS11 53 0.01599524989930021 +CMS CMSJETS11 54 0.008733853605811368 +CMS CMSJETS11 55 0.004594104744837374 +CMS CMSJETS11 56 0.002236004787141291 +CMS CMSJETS11 57 0.0014403506626033526 +CMS CMSJETS11 58 0.0007585311754938085 +CMS CMSJETS11 59 0.0003499911931369715 +CMS CMSJETS11 60 0.00011161977018081332 +CMS CMSJETS11 61 7.773134653806228e-05 +CMS CMSJETS11 62 1.0739797452865742e-05 +CMS CMSJETS11 63 2582.266754202891 +CMS CMSJETS11 64 1096.3104931367543 +CMS CMSJETS11 65 515.8803717520409 +CMS CMSJETS11 66 247.584409749495 +CMS CMSJETS11 67 125.56185712566229 +CMS CMSJETS11 68 64.59934467165961 +CMS CMSJETS11 69 33.7709164498504 +CMS CMSJETS11 70 18.28629614870014 +CMS CMSJETS11 71 9.863997071005198 +CMS CMSJETS11 72 5.208105065908411 +CMS CMSJETS11 73 2.890135232927644 +CMS CMSJETS11 74 1.6089548948524601 +CMS CMSJETS11 75 0.849452414837028 +CMS CMSJETS11 76 0.4673888181261503 +CMS CMSJETS11 77 0.2668471983983226 +CMS CMSJETS11 78 0.14592329381516939 +CMS CMSJETS11 79 0.07508557256079366 +CMS CMSJETS11 80 0.040293907781475916 +CMS CMSJETS11 81 0.02119984449211859 +CMS CMSJETS11 82 0.010995803998216362 +CMS CMSJETS11 83 0.005466812931544958 +CMS CMSJETS11 84 0.002642109565833648 +CMS CMSJETS11 85 0.0013200185870372624 +CMS CMSJETS11 86 0.0005247905933328646 +CMS CMSJETS11 87 0.00028209984999926743 +CMS CMSJETS11 88 0.0001720895024153124 +CMS CMSJETS11 89 1.1238503789800609e-05 +CMS CMSJETS11 90 2003.922630727413 +CMS CMSJETS11 91 874.731868408299 +CMS CMSJETS11 92 392.1741073514707 +CMS CMSJETS11 93 187.66789571263348 +CMS CMSJETS11 94 92.02147434259216 +CMS CMSJETS11 95 44.1730889548527 +CMS CMSJETS11 96 23.833613441467335 +CMS CMSJETS11 97 11.819012342827115 +CMS CMSJETS11 98 6.209444739322469 +CMS CMSJETS11 99 3.099195938055046 +CMS CMSJETS11 100 1.5455020113031772 +CMS CMSJETS11 101 0.8181216756637731 +CMS CMSJETS11 102 0.3978236711718021 +CMS CMSJETS11 103 0.1973724877272748 +CMS CMSJETS11 104 0.09488179243734336 +CMS CMSJETS11 105 0.04284285858513589 +CMS CMSJETS11 106 0.02013247664048886 +CMS CMSJETS11 107 0.00904569048761001 +CMS CMSJETS11 108 0.003957411563322136 +CMS CMSJETS11 109 0.0013947956627950616 +CMS CMSJETS11 110 0.0005423450935199612 +CMS CMSJETS11 111 0.00019426047276743468 +CMS CMSJETS11 112 8.641325482048829e-05 +CMS CMSJETS11 113 1.1205549962724038e-05 +CMS CMSJETS11 114 1378.9721889654777 +CMS CMSJETS11 115 586.5280099525319 +CMS CMSJETS11 116 254.46531592644695 +CMS CMSJETS11 117 117.48542213423157 +CMS CMSJETS11 118 50.164169623489386 +CMS CMSJETS11 119 23.869966504044648 +CMS CMSJETS11 120 10.595115571480735 +CMS CMSJETS11 121 4.680940497616173 +CMS CMSJETS11 122 2.026200900502425 +CMS CMSJETS11 123 0.9369729158473754 +CMS CMSJETS11 124 0.3821993863199565 +CMS CMSJETS11 125 0.14796011741945186 +CMS CMSJETS11 126 0.05808605576889366 +CMS CMSJETS11 127 0.020258461063771327 +CMS CMSJETS11 128 0.006549884219507623 +CMS CMSJETS11 129 0.0021102678286488382 +CMS CMSJETS11 130 0.00027998963463139373 +CMS CMSJETS11 131 8.597370643627551e-05 +CMS CMSJETS11 132 9.361652706292585e-06 diff --git a/validphys2/src/validphys/tests/test_regressions.py b/validphys2/src/validphys/tests/test_regressions.py index 9dd2c10333..dd671ee3ec 100644 --- a/validphys2/src/validphys/tests/test_regressions.py +++ b/validphys2/src/validphys/tests/test_regressions.py @@ -18,8 +18,9 @@ import NNPDF from validphys import results from validphys.api import API -from validphys.tableloader import (parse_exp_mat, load_perreplica_chi2_table, +from validphys.tableloader import (parse_data_cv, parse_exp_mat, load_perreplica_chi2_table, sane_load, load_fits_chi2_table) +from validphys.tests.test_covmats import CORR_DATA @@ -51,6 +52,18 @@ def f_(*args, **kwargs): return f_ return decorator + +@make_table_comp(parse_data_cv) +def test_mcreplica(data_config): + config = dict(data_config) + config["dataset_inputs"] = CORR_DATA + seed = 123456 + # Use no cuts because if filter rules change in the + # future then this test will end up failing + rep = API.indexed_make_replica(**config, seed=seed) + return rep + + @make_table_comp(parse_exp_mat) def test_expcovmat(data_config): mat = API.groups_covmat_no_table(**data_config)