From 8493c7e892b1181041674a0bf5c0d3f6f17e59df Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 19 Aug 2020 14:57:02 +0100 Subject: [PATCH 01/28] A direct translation of the C++ pseudodata code No attempt at optimization has been done Uses the same seed structure as C++ so will give the same pseudodata if the same RNG seed is used. This commit is a proof of concept. --- validphys2/src/validphys/covmats.py | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 530ced5815..3bf0b2dbf4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,6 +12,7 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -21,10 +22,64 @@ ) from validphys.results import ThPredictionsResult +import NNPDF + log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") +def make_replica(commondata, seed=1): + # dataset_input: {dataset: NMC, frac: 0.5} + + # use_cuts: nocuts + + # theoryid: 53 + + # template_text: | + # {@make_replica@} + + # actions_: + # - report(main=True) + + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + ld_cd = load_commondata(commondata) + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = 
np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue + + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] + + # TODO: check the pseudodata is positive + # requires implementing the above code in a while loop + return artdata + def covmat_from_systematics(commondata): """Take the statistical uncertainty and systematics table from @@ -68,6 +123,12 @@ def covmat_from_systematics(commondata): CommonData which stores information about systematic errors, their treatment and description. + use_mult_errors: bool + Boolean which controls whether we use multiplicative errors + when computing the covmat. By default it should be set to ``True`` + unles one wishes to uses a sampling matrix for pseudodata generation, + in which case it should be set to ``False``. 
+ Returns ------- cov_mat : np.array From 68e9ced95e59d83f3ab84ec1bc94bcee8f27021c Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 11:46:02 +0100 Subject: [PATCH 02/28] Loop until positivity of pseudodata is achieved --- validphys2/src/validphys/covmats.py | 54 ++++++++++++++++------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 3bf0b2dbf4..18c76261ff 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -50,34 +50,40 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) - - correlated_deviates = sampling_matrix @ deviates + # We need to loop until the pseudodata is positive + while True: + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue - artdata = np.array(ld_cd.central_values) + correlated_deviates + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, 
j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue + artdata = np.array(artdata) - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + # Check positivity of the pseudodata + if all(artdata > 0): + break - # TODO: check the pseudodata is positive - # requires implementing the above code in a while loop return artdata From b2245c8f5f600469f1124a89bdac326c5105d99a Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 15:47:37 +0100 Subject: [PATCH 03/28] Using pandas and numpy to generate the pseudodata This removes the for loop implementation --- validphys2/src/validphys/covmats.py | 73 +++++++++++++++++++---------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 18c76261ff..749bd01ab2 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -50,41 +50,64 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. 
+ # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know which of the remaining systematics were + # uncorrelated. The other we mark as NaN + mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + # We need to loop until the pseudodata is positive while True: - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) correlated_deviates = sampling_matrix @ deviates - artdata = np.array(ld_cd.central_values) + correlated_deviates - - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue - - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) - artdata = np.array(artdata) + # The final pseudodata is obtained by doing an 
elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata - if all(artdata > 0): + if all(pseudodata > 0): break - return artdata + return pseudodata def covmat_from_systematics(commondata): From 196f31faf51f85107505af59a1371a8da77febeb Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 24 Aug 2020 15:24:47 +0100 Subject: [PATCH 04/28] Adding docstring for make_replica function Also correcting some typos in the docstring of covmat_from_systematics --- validphys2/src/validphys/covmats.py | 64 ++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 749bd01ab2..541b607cf7 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,6 +12,7 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.coredata import CommonData from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, @@ -29,24 +30,58 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") def make_replica(commondata, seed=1): - # dataset_input: {dataset: NMC, frac: 0.5} + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + of the data central value. - # use_cuts: nocuts + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. 
The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. - # theoryid: 53 + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. - # template_text: | - # {@make_replica@} + seed: int + Seed used to initialise the random number generator. - # actions_: - # - report(main=True) + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.covmats import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. 
+ """ # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) rng = NNPDF.RandomGenerator.GetRNG() - ld_cd = load_commondata(commondata) + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) @@ -69,8 +104,7 @@ def make_replica(commondata, seed=1): ] mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - # We want to know which of the remaining systematics were - # uncorrelated. The other we mark as NaN + # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] # We need to loop until the pseudodata is positive @@ -146,9 +180,9 @@ def covmat_from_systematics(commondata): `paper `_ outlining the procedure, specifically equation 2 and surrounding text. - Paramaters + Parameters ---------- - commondata : validphys.coredata.CommonData + commondata: validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. @@ -160,9 +194,9 @@ def covmat_from_systematics(commondata): Returns ------- - cov_mat : np.array - Numpy array which is N_dat x N_dat (where N_dat is the number of data - points after cuts) containing uncertainty and correlation information. + cov_mat: np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. 
Example ------- From c5ba90d867071291f651d037ebec48c5b8858a9a Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Wed, 26 Aug 2020 13:38:58 +0100 Subject: [PATCH 05/28] Apply suggestions from code review Co-authored-by: Cameron Voisey <32741139+voisey@users.noreply.github.com> --- validphys2/src/validphys/covmats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 541b607cf7..55b9396017 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -31,7 +31,7 @@ def make_replica(commondata, seed=1): """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas of the data central value. The square root (obtained by the Cholesky decomposition) of the covariance @@ -189,7 +189,7 @@ def covmat_from_systematics(commondata): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unles one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to uses a sampling matrix for pseudodata generation, in which case it should be set to ``False``. 
Returns From ca2f21422e2c33c9a7d586b52aa576e58fad0b60 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 13:00:32 +0100 Subject: [PATCH 06/28] Moving to pseudodata.py --- validphys2/src/validphys/covmats.py | 4 - validphys2/src/validphys/pseudodata.py | 120 +++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 55b9396017..51626b6ac4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,8 +12,6 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec -from validphys.coredata import CommonData -from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -23,8 +21,6 @@ ) from validphys.results import ThPredictionsResult -import NNPDF - log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d62331f467..d0990114ab 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -12,12 +12,17 @@ import pandas as pd from validphys.checks import check_cuts_fromfit, check_darwin_single_process +from validphys.commondataparser import load_commondata +from validphys.coredata import CommonData +from validphys.covmats import covmat_from_systematics, sqrt_covmat from reportengine import collect import n3fit.io.reader as reader from n3fit.performfit import initialize_seeds +import NNPDF + log = logging.getLogger(__name__) @@ -104,6 +109,121 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index +def make_replica(commondata, seed=1): + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + 
:py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas + of the data central value. + + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. + + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. + + seed: int + Seed used to initialise the random number generator. + + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.pseudodata import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. 
+ """ + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. + # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know the names of the remaining systematics. 
+ mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + + correlated_deviates = sampling_matrix @ deviates + + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) + + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. 
+ pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + + # Check positivity of the pseudodata + if all(pseudodata > 0): + break + + return pseudodata + + @check_darwin_single_process def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): """A function to obtain information about the pseudodata that went From 821b6aae77721a1915b063fedb55967cd6d9e263 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 15:54:30 +0100 Subject: [PATCH 07/28] Making the function into an infinite generator Thus it remembers the RNG state --- validphys2/src/validphys/pseudodata.py | 74 ++++++++++++++------------ 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d0990114ab..69a65e2aef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -110,9 +110,10 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. + """Generator that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas + of the data central value. This generator is infinite in the sense that it will + never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance matrix is used as the sampling matrix. 
The covariance matrix is obtained using @@ -141,13 +142,14 @@ def make_replica(commondata, seed=1): >>> l = Loader() >>> from validphys.pseudodata import make_replica >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) + >>> data_generator = make_replica(cd) + >>> next(data_generator) - Random Generator allocated: ranlux array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. @@ -187,41 +189,45 @@ def make_replica(commondata, seed=1): # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - # We need to loop until the pseudodata is positive + # We make this generator infinite in the sense that it shall never + # raise a StopIteration exception. Thus we can continuely request + # pseudodata from it and it will remember the RNG state. 
while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - correlated_deviates = sampling_matrix @ deviates + correlated_deviates = sampling_matrix @ deviates - pseudodata = np.array(central_values) + correlated_deviates + pseudodata = np.array(central_values) + correlated_deviates - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) ] - for _ in range(ndata) - ] - ) + ) - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): - break + # Check positivity of the pseudodata + if all(pseudodata > 0): + break - return pseudodata + yield pseudodata @check_darwin_single_process From 7f4c3f567da4d0c2a28b06bb3815c7d0c5602f45 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 10:54:01 +0100 Subject: [PATCH 08/28] Adding comment explaining why we divide the multiplicative uncertainties by 100 --- validphys2/src/validphys/pseudodata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 69a65e2aef..186747f62c 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -184,6 +184,8 @@ def make_replica(commondata, seed=1): (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] + # The multiplicative uncertainties are quoted as percentages, so + # we change them to fractional values mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. 
From 9f28ca9592812afcc214f5ac96561123a8ec88e6 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:29:26 +0100 Subject: [PATCH 09/28] Update validphys2/src/validphys/covmats.py Co-authored-by: Zaharid --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 51626b6ac4..921ae2bfdd 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -185,7 +185,7 @@ def covmat_from_systematics(commondata): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unless one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to sample pseudodata from a multigaussian distribution, in which case it should be set to ``False``. Returns From da36db57b96ceb58c3e5f63a193d1af1016a6ef7 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 16:10:04 +0100 Subject: [PATCH 10/28] Changing covmat_for_sampling to additive_covmat --- validphys2/src/validphys/pseudodata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 186747f62c..09ce272ad2 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -164,8 +164,8 @@ def make_replica(commondata, seed=1): else: ld_cd = commondata - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) + additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(additive_covmat) central_values = ld_cd.central_values nsys, ndata = ld_cd.nsys, ld_cd.ndata From 5649743c07984f18f8856cabaaa593beffffb132 Mon Sep 17 00:00:00 2001 From: siranipour Date: 
Wed, 16 Sep 2020 15:30:32 +0100 Subject: [PATCH 11/28] Changing variable names to better reflect their purpose --- validphys2/src/validphys/pseudodata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 09ce272ad2..e111906767 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -179,17 +179,17 @@ def make_replica(commondata, seed=1): # Drop any systematics that are related to theory uncertainties # or have explicitly been labelled as a systematic to be skipped. # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ + mult_sys = systematics.loc[ :, (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] # The multiplicative uncertainties are quoted as percentages, so # we change them to fractional values - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + mult_sys = mult_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + mult_names = sys_names.loc[mult_sys.columns] # We make this generator infinite in the sense that it shall never # raise a StopIteration exception. Thus we can continuely request @@ -213,7 +213,7 @@ def make_replica(commondata, seed=1): [ [ rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + for i, sys_type in mult_names.iteritems() ] for _ in range(ndata) ] @@ -223,7 +223,7 @@ def make_replica(commondata, seed=1): # of the multiplicative uncertainties with the above random matrix. We then # add unity to all elements and take the product across columns. This vector # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata if all(pseudodata > 0): From fe6a653c51ea133c436c182220e27e95d305cb46 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:12:33 +0100 Subject: [PATCH 12/28] Accounting for the asymmetry edge case --- validphys2/src/validphys/pseudodata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e111906767..1414f8f3ef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,9 @@ def make_replica(commondata, seed=1): # must then be elementwise multiplied by central values + correlated deviates. pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): + # Check positivity of the pseudodata. Note that if the dataset is an asymmetry + # then it is allowed to be negative. + if all(pseudodata > 0) or ("ASY" in commondata.process_type): break yield pseudodata From 7d5c9d289a10e42c3233ef79bc6da5866111562c Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:55:48 +0100 Subject: [PATCH 13/28] Adding information on the nature of sampling in the docstring --- validphys2/src/validphys/pseudodata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 1414f8f3ef..0c462cd083 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -120,6 +120,11 @@ def make_replica(commondata, seed=1): :py:func:`validphys.covmats.covmat_from_systematics` and must not use multiplicative systematic uncertainties when obtaining the sampling matrix. 
+ Gaussian sampling is performed using the additive part of the covariance matrix. + An additional gaussian sampling for the renormalization of the data having multiplicative + uncertainties happens later. All the contributions to the covariance matrix coming from + theory errors should be accounted for in the additive part + Parameters --------- commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` From 3365dec2687608b33f13f8c25311cf954c540df3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 6 Oct 2020 17:29:05 +0100 Subject: [PATCH 14/28] Correcting covmat_from_systematics example --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 921ae2bfdd..f861f12884 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -198,7 +198,7 @@ def covmat_from_systematics(commondata): ------- >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> from validphys.calcutils import covmat_from_systematics + >>> from validphys.covmats import covmat_from_systematics >>> l = Loader() >>> cd = l.check_commondata("NMC") >>> cd = load_commondata(cd) From a3fe40fe48736388dbc66420706e41611c9a4b47 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:15:56 +0100 Subject: [PATCH 15/28] Clarifying doc strings Adding Zahari's review comments --- validphys2/src/validphys/covmats.py | 7 ++++--- validphys2/src/validphys/pseudodata.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index f861f12884..17660d76c5 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -183,10 +183,11 @@ def covmat_from_systematics(commondata): their treatment and description. 
use_mult_errors: bool - Boolean which controls whether we use multiplicative errors + Boolean which controls whether multiplicative errors are used when computing the covmat. By default it should be set to ``True`` - unless one wishes to sample pseudodata from a multigaussian distribution, - in which case it should be set to ``False``. + since setting this parameter to ``False`` will completely ignore multiplicative + uncertainties. This flag is present for use with pseudodata generation where the + multiplicative uncertainties should be ignored. Returns ------- diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0c462cd083..7ed133912d 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -116,14 +116,14 @@ def make_replica(commondata, seed=1): never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Gaussian sampling is performed using the additive part of the covariance matrix. - An additional gaussian sampling for the renormalization of the data having multiplicative - uncertainties happens later. All the contributions to the covariance matrix coming from - theory errors should be accounted for in the additive part + matrix is used as the sampling matrix with the covariance matrix being obtained using + :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call + to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to + ignore multiplicative uncertainties. Gaussian sampling is performed using the + additive part of the covariance matrix. 
An additional gaussian sampling for the + renormalization of the data having multiplicative uncertainties happens later. + All the contributions to the covariance matrix coming from theory errors should be + accounted for in the additive part. Parameters --------- From 79fbe95d7c3364f531a355e33632db8f39802984 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:20:40 +0100 Subject: [PATCH 16/28] Setting default seed kwarg to None Assigning seed randomly if not provided explicilty in function call --- validphys2/src/validphys/pseudodata.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 7ed133912d..b804e68853 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,6 +7,7 @@ import multiprocessing as mp import os import pathlib +import random import numpy as np import pandas as pd @@ -109,7 +110,7 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=1): +def make_replica(commondata, seed=None): """Generator that takes in a :py:class:`validphys.coredata.CommonData` or :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -131,8 +132,10 @@ def make_replica(commondata, seed=1): CommonData which stores information about systematic errors, their treatment and description. - seed: int - Seed used to initialise the random number generator. + seed: int, None + Seed used to initialise the random number generator. If ``None`` then a random seed is allocated + using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing + inadvertent correlations. Returns ------- @@ -159,6 +162,11 @@ def make_replica(commondata, seed=1): .. 
todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. """ + # If seed is None (i.e not explicitly provided in the function call) + # then set it randomly + if seed is None: + seed = random.randint(1, 1000) + # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) From 8905f0ee2e28de97b801379feb9dce3195442cb3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 11:23:53 +0000 Subject: [PATCH 17/28] Removing make replica from covmats.py --- validphys2/src/validphys/covmats.py | 114 ---------------------------- 1 file changed, 114 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 17660d76c5..e184e57da0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -25,120 +25,6 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") -def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Parameters - --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description. - - seed: int - Seed used to initialise the random number generator. 
- - Returns - ------- - pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) - containing monte carlo samples of data centered around the data central value. - - - Example - ------- - >>> from validphys.loader import Loader - >>> l = Loader() - >>> from validphys.covmats import make_replica - >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. - """ - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. 
- # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - - # We need to loop until the pseudodata is positive - while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() - ] - for _ in range(ndata) - ] - ) - - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - - # Check positivity of the pseudodata - if all(pseudodata > 0): - break - - return pseudodata - def covmat_from_systematics(commondata): """Take the statistical uncertainty and systematics table from From 0a8ec390a654c1208bb81766ea69bbda6b1c9afc Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 13:35:46 +0000 Subject: [PATCH 18/28] Adding use_mult_errors flag to datasets_covmat_from_systematics --- validphys2/src/validphys/covmats.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index e184e57da0..85ccc91ee4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -68,13 +68,6 @@ def covmat_from_systematics(commondata): CommonData which stores information about systematic errors, their treatment and description. - use_mult_errors: bool - Boolean which controls whether multiplicative errors are used - when computing the covmat. By default it should be set to ``True`` - since setting this parameter to ``False`` will completely ignore multiplicative - uncertainties. This flag is present for use with pseudodata generation where the - multiplicative uncertainties should be ignored. - Returns ------- cov_mat: np.array @@ -108,7 +101,7 @@ def covmat_from_systematics(commondata): commondata.stat_errors.to_numpy(), commondata.systematic_errors()) -def datasets_covmat_from_systematics(list_of_commondata): +def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): """Given a list containing :py:class:`validphys.coredata.CommonData` s, construct the full covariance matrix. @@ -123,6 +116,10 @@ def datasets_covmat_from_systematics(list_of_commondata): list_of_commondata : list[validphys.coredata.CommonData] list of CommonData objects. 
+ use_mult_errors: bool + Whether or not to include multiplicative systematic errors in the computation of + the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. + Returns ------- cov_mat : np.array @@ -147,7 +144,10 @@ def datasets_covmat_from_systematics(list_of_commondata): block_diags = [] for cd in list_of_commondata: - errors = cd.systematic_errors() + if use_mult_errors: + errors = cd.systematic_errors() + else: + errors = cd.additive_errors # separate out the special uncertainties which can be correlated across # datasets is_intra_dataset_error = errors.columns.isin(INTRA_DATASET_SYS_NAME) From a1820d353909dafd10d7358d63aa823984066705 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 7 Dec 2020 15:17:21 +0000 Subject: [PATCH 19/28] Adding functionality to compute pseudodata for a list of commondata Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 140 ++++++++++++------------- 1 file changed, 69 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index b804e68853..2e7d5cbee4 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -15,7 +15,7 @@ from validphys.checks import check_cuts_fromfit, check_darwin_single_process from validphys.commondataparser import load_commondata from validphys.coredata import CommonData -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect @@ -110,7 +110,8 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=None): +def make_replica(list_of_commondata, seed=None): + # TODO: this is the old docstring: update it """Generator that takes in a :py:class:`validphys.coredata.CommonData` or 
:py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -165,85 +166,82 @@ def make_replica(commondata, seed=None): # If seed is None (i.e not explicitly provided in the function call) # then set it randomly if seed is None: - seed = random.randint(1, 1000) + seed = np.random.randint(1000) - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(additive_covmat) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. - # We are only interested in multiplicative uncertainties - mult_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - # The multiplicative uncertainties are quoted as percentages, so - # we change them to fractional values - mult_sys = mult_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_names = sys_names.loc[mult_sys.columns] + # Seed the numpy RNG with the seed. + np.random.seed(seed=seed) # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continuely request + # raise a StopIteration exception. 
Thus we can continually request # pseudodata from it and it will remember the RNG state. while True: - # We need to loop until the pseudodata is positive + # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_names.iteritems() - ] - for _ in range(ndata) - ] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + 
).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata. Note that if the dataset is an asymmetry - # then it is allowed to be negative. 
- if all(pseudodata > 0) or ("ASY" in commondata.process_type): + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) + + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): break - yield pseudodata + yield all_pseudodata @check_darwin_single_process From 5b33d2766bf5e06dcf0ce4f4ae88760a7544e954 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 8 Dec 2020 13:09:15 +0000 Subject: [PATCH 20/28] Adding fillna(0) to special additive errors --- validphys2/src/validphys/pseudodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 2e7d5cbee4..da9311e00b 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,8 @@ def make_replica(list_of_commondata, seed=None): check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) # if we sort here (which sorts columns), then permuting datasets doesn't change the result - special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() # non-overlapping systematics are set to NaN by concat, fill with 0 instead. 
+ special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() From cfdb94619ed92f0edaab849befbebff9e5bb9b19 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 9 Dec 2020 15:58:08 +0000 Subject: [PATCH 21/28] Changing covmats.py to not use the use_mult_errors flag --- validphys2/src/validphys/covmats.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 85ccc91ee4..02b8268b9b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -101,7 +101,7 @@ def covmat_from_systematics(commondata): commondata.stat_errors.to_numpy(), commondata.systematic_errors()) -def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): +def datasets_covmat_from_systematics(list_of_commondata): """Given a list containing :py:class:`validphys.coredata.CommonData` s, construct the full covariance matrix. @@ -116,10 +116,6 @@ def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): list_of_commondata : list[validphys.coredata.CommonData] list of CommonData objects. - use_mult_errors: bool - Whether or not to include multiplicative systematic errors in the computation of - the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. 
- Returns ------- cov_mat : np.array @@ -144,10 +140,7 @@ def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): block_diags = [] for cd in list_of_commondata: - if use_mult_errors: - errors = cd.systematic_errors() - else: - errors = cd.additive_errors + errors = cd.systematic_errors() # separate out the special uncertainties which can be correlated across # datasets is_intra_dataset_error = errors.columns.isin(INTRA_DATASET_SYS_NAME) From 0129724f5bb1525bcc987c27abe1fbcaa156e211 Mon Sep 17 00:00:00 2001 From: wilsonmr <33907451+wilsonmr@users.noreply.github.com> Date: Wed, 16 Dec 2020 10:01:31 +0000 Subject: [PATCH 22/28] Apply suggestions from code review - comments to split up code Co-authored-by: Rosalyn Pearson <33020850+RosalynLP@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index da9311e00b..eb450f0fc9 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,7 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() @@ -197,9 +198,11 @@ def make_replica(list_of_commondata, seed=None): # append the partially shifted pseudodata pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use special_add.append( add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ mult_errors = cd.multiplicative_errors mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction @@ -214,6 +217,7 @@ def make_replica(list_of_commondata, seed=None): mult_shifts.append(mult_shift) + # store 
the multiplicative errors with correlations between datasets for later use special_mult.append( mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) From 6772fb69bcd3dfb8b7cef0669c4923ded6f757a9 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 16 Dec 2020 10:17:23 +0000 Subject: [PATCH 23/28] include stat errors --- validphys2/src/validphys/pseudodata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index eb450f0fc9..c74529b48f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,9 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() From b662abbb5fdfde851e291cdbce3ae0fff09a353f Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 15 Jan 2021 14:06:45 +0000 Subject: [PATCH 24/28] Removing if seed is None Numpy has this as default behaviour Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index c74529b48f..0a1b70e064 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -163,11 +163,6 @@ def make_replica(list_of_commondata, seed=None): .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. 
""" - # If seed is None (i.e not explicitly provided in the function call) - # then set it randomly - if seed is None: - seed = np.random.randint(1000) - # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From a6423de477a0f55cb7c0bdb6d6ca24a19d8e78c0 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:18:24 +0000 Subject: [PATCH 25/28] Changing generator to function --- validphys2/src/validphys/pseudodata.py | 138 ++++++++++++------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0a1b70e064..b65024a5b7 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -166,84 +166,80 @@ def make_replica(list_of_commondata, seed=None): # Seed the numpy RNG with the seed. np.random.seed(seed=seed) - # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continually request - # pseudodata from it and it will remember the RNG state. 
+ # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # The inner while True loop is for ensuring a positive definite - # pseudodata replica - while True: - pseudodatas = [] - special_add = [] - special_mult = [] - mult_shifts = [] - check_positive_masks = [] - for cd in list_of_commondata: - pseudodata = cd.central_values.to_numpy() - - # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) - - # ~~~ ADDITIVE ERRORS ~~~ - add_errors = cd.additive_errors - add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) - - # correlated within dataset - add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) - - # append the partially shifted pseudodata - pseudodatas.append(pseudodata) - # store the additive errors with correlations between datasets for later use - special_add.append( - add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) - # ~~~ MULTIPLICATIVE ERRORS ~~~ - mult_errors = cd.multiplicative_errors - mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() - # convert to from percent to fraction - mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 - ).prod(axis=1) - - mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() - mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 - ).prod(axis=1) - - mult_shifts.append(mult_shift) - - # store the multiplicative errors with correlations between datasets for later use - special_mult.append( - mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + 
check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + + # ~~~ ADDITIVE ERRORS ~~~ + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + ).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + # store the multiplicative errors with correlations between datasets for later use + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) - # mask out the data we want to check are all positive - if "ASY" in cd.commondataproc: - check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) - else: - check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + 
check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - # if we sort here (which sorts columns), then permuting datasets doesn't change the result - # non-overlapping systematics are set to NaN by concat, fill with 0 instead. - special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() - special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - all_pseudodata = ( - np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) - ) * ( - np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) - ) + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) - if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): - break + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): + break - yield all_pseudodata + return all_pseudodata @check_darwin_single_process From 3419bc66e2185e44b6cc71deff957c934845c4a9 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:27:56 +0000 Subject: [PATCH 26/28] Updating doc string --- validphys2/src/validphys/pseudodata.py | 58 ++++++++++---------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py 
b/validphys2/src/validphys/pseudodata.py index b65024a5b7..dcd47203b3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -111,57 +111,43 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(list_of_commondata, seed=None): - # TODO: this is the old docstring: update it - """Generator that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas - of the data central value. This generator is infinite in the sense that it will - never raise a StopIteration exception - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix with the covariance matrix being obtained using - :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call - to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to - ignore multiplicative uncertainties. Gaussian sampling is performed using the - additive part of the covariance matrix. An additional gaussian sampling for the - renormalization of the data having multiplicative uncertainties happens later. - All the contributions to the covariance matrix coming from theory errors should be - accounted for in the additive part. + """Function that takes in a list of :py:class:`validphys.coredata.CommonData` + objects and returns pseudodata replicas of the data central value accounting for + possible correlations between systematic uncertainties. + + The function loops until positive pseudodata is generated for all + non-asymmetry datasets. In the case of an asymmetry dataset negative values are + permitted so the loop block executes only once. Parameters --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description.
+ list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`] + Iterable of CommonData objects which stores information about systematic errors, + their treatment and description, for each dataset. seed: int, None - Seed used to initialise the random number generator. If ``None`` then a random seed is allocated - using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing - inadvertent correlations. + Seed used to initialise the numpy random number generator. If ``None`` then a random seed is + allocated using the default numpy behaviour. Returns ------- pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) + Numpy array which is N_dat (where N_dat is the combined number of data points after cuts) containing monte carlo samples of data centered around the data central value. - Example ------- + >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> l = Loader() >>> from validphys.pseudodata import make_replica - >>> cd = l.check_commondata("NMC") - >>> data_generator = make_replica(cd) - >>> next(data_generator) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. 
+ >>> l = Loader() + >>> cd1 = l.check_commondata("NMC") + >>> cd2 = l.check_commondata("NMCPD") + >>> lds = map(load_commondata, (cd1, cd2)) + >>> make_replica(lds) + array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , + 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, + 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From 14abf628273121073e7eeef747c2db66147d6a78 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 16:16:34 +0000 Subject: [PATCH 27/28] Using new numpy.random interface --- validphys2/src/validphys/pseudodata.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index dcd47203b3..e5ec8a30f6 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,7 +7,6 @@ import multiprocessing as mp import os import pathlib -import random import numpy as np import pandas as pd @@ -150,7 +149,7 @@ def make_replica(list_of_commondata, seed=None): 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. 
- np.random.seed(seed=seed) + rng = np.random.default_rng(seed=seed) # The inner while True loop is for ensuring a positive definite # pseudodata replica @@ -164,17 +163,17 @@ def make_replica(list_of_commondata, seed=None): pseudodata = cd.central_values.to_numpy() # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)) # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1) # correlated within dataset add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1]) # append the partially shifted pseudodata pseudodatas.append(pseudodata) @@ -187,12 +186,12 @@ def make_replica(list_of_commondata, seed=None): mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 ).prod(axis=1) mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 ).prod(axis=1) mult_shifts.append(mult_shift) @@ -216,10 +215,10 @@ def make_replica(list_of_commondata, seed=None): all_pseudodata = ( np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + + special_add_errors 
@ rng.normal(size=special_add_errors.shape[1]) ) * ( np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1) ) if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): From 9282cc1941ed3f5e78a3644f733ce2e4d6b9990e Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 21 Jan 2021 17:49:26 +0000 Subject: [PATCH 28/28] Working on reconstructing pseudodata --- validphys2/src/validphys/pseudodata.py | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e5ec8a30f6..fc4cc81324 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -30,6 +30,66 @@ context_index = collect("groups_index", ("fitcontext",)) + +def reconstruct_replica(fit, list_of_commondata, replica: int): + """A function that reconstructs the pseudodata seen by replica number ``replica`` + + #TODO: add list_of_commondata + Parameters + ---------- + fit: validphys.core.fit + The input fit + replica: int + The replica number for which the pseudodata is reconstructed + + Returns + ------- + #XXX: maybe this will be a pd.Series + pseudodata: np.array + The pseudodata in array format + + Raises + ------ + KeyError: + If one of ``trvlseed``, ``nnseed``, or ``mcseed`` is not found under the ``fitting`` + namespace + + Example: + >>> from validphys.api import API + >>> API.reconstruct_replica(fit="NNPDF40_nnlo_as_0118", replica=0) + #TODO: finish this example + """ + log.warning(f"Attempt to reconstruct pseudodata for MC replica {replica} " + f"of {fit}. Be advised that this functionality is not available " + "for all fits, but only those fitted after N3FIT switched to the " + "Python MakeReplica function."
+ ) + + runcard = fit.as_input() + + try: + fit_ns = runcard["fitting"] + trvlseed = fit_ns["trvlseed"] + nnseed = fit_ns["nnseed"] + mcseed = fit_ns["mcseed"] + except KeyError as e: + raise KeyError(f"The following keys were not found in the fitting namespace {str(e)}, " + "please ensure the input fit is a valid N3FIT fit." + ) from e + + seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep=True) + + pseudodata = make_replica(list_of_commondata, seed=mcseed) + + # TODO: change the need for datasets and exp_name + trmasks, vlmasks = reader.make_tr_val_mask(datasets=None, exp_name=None, seed=trvlseed) + + training = pseudodata[:, trmasks] + validation = pseudodata[:, vlmasks] + + return training, validation + + @check_cuts_fromfit def read_fit_pseudodata(fitcontext, context_index): """Generator to handle the reading of training and validation splits for a fit that has been