MakeReplica in python #866
@@ -12,6 +12,7 @@
import pandas as pd

from validphys.checks import check_cuts_fromfit, check_darwin_single_process
from validphys.covmats import INTRA_DATASET_SYS_NAME

from reportengine import collect
@@ -104,6 +105,129 @@ def read_fit_pseudodata(fitcontext, context_index):
    yield pseudodata.drop("type", axis=1), tr.index, val.index


def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None):
    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
    objects and returns a pseudodata replica accounting for
    possible correlations between systematic uncertainties.

    The function loops until positive definite pseudodata is generated for any
    non-asymmetry datasets. In the case of an asymmetry dataset negative values are
    permitted so the loop block executes only once.
Comment on lines +114 to +115:

Contributor:
Not sure if I'm being pedantic here, but the loop is based on the list of datasets, so although negative values are permitted for ASY datasets, it may still execute twice. My suggestion would be "...non-asymmetry datasets. In the case of an asymmetry dataset negative values are permitted." I'll leave it to somebody who is better at writing docstrings than me to determine if this is worth changing @RosalynLP

Contributor:
@wilsonmr yep, fwiw I think this should def be changed to what you said.
    Parameters
    ----------
    dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`]
        List of CommonData objects which store information about systematic errors,
        their treatment and description, for each dataset.
    seed: int, None
        Seed used to initialise the numpy random number generator. If ``None`` then a
        random seed is allocated using the default numpy behaviour.

    Returns
    -------
    pseudodata: np.array
        Numpy array of length N_dat (where N_dat is the combined number of data points
        after cuts) containing Monte Carlo samples of data centred around the data
        central value.

    Example
    -------
    >>> from validphys.api import API
    >>> pseudodata = API.make_replica(
    ...     dataset_inputs=[{"dataset": "NMC"}, {"dataset": "NMCPD"}],
    ...     use_cuts="nocuts",
    ...     theoryid=53,
    ... )
    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
           0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
           0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
    """
    # Seed the numpy RNG with the seed.
    rng = np.random.default_rng(seed=seed)

    # The while True loop is for ensuring a positive definite
    # pseudodata replica
    while True:
        pseudodatas = []
        special_add = []
        special_mult = []
        mult_shifts = []
        check_positive_masks = []
        for cd in dataset_inputs_loaded_cd_with_cuts:
            # copy here to avoid mutating the central values.
            pseudodata = cd.central_values.to_numpy(copy=True)

            # add contribution from statistical uncertainty
            pseudodata += cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)

            # ~~~ ADDITIVE ERRORS ~~~
            add_errors = cd.additive_errors
            add_uncorr_errors = add_errors.loc[:, add_errors.columns == "UNCORR"].to_numpy()

            # uncorrelated: an independent shift for every point and every systematic
            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)

            # correlated within dataset: one shift per systematic, shared by all points
            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])

            # append the partially shifted pseudodata
            pseudodatas.append(pseudodata)
            # store the additive errors with correlations between datasets for later use
            special_add.append(
                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )
            # ~~~ MULTIPLICATIVE ERRORS ~~~
            mult_errors = cd.multiplicative_errors
            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
            # convert from percent to fraction
            mult_shift = (
                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
            ).prod(axis=1)

            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
            mult_shift *= (
                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
            ).prod(axis=1)

            mult_shifts.append(mult_shift)
Contributor:
I don't understand the logic here. I don't see how these things are being correlated across datasets. Random numbers for things like lumi should only be sampled once, and here it seems it is once per dataset... That logic only seems to be there for the "special" systematics.

Contributor:
This only gets applied to systematics which have a systype row of MULT, CORR. Everything that needs to be correlated across datasets, i.e. lumi, will have a special name which is not CORR and, as you point out, is handled below.

Contributor:
Ah, I see. I had missed the

Contributor:
Yeah, perhaps the comments could be made more obvious.
            # store the multiplicative errors with correlations between datasets for later use
            special_mult.append(
                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # mask out the data we want to check are all positive
            if "ASY" in cd.commondataproc:
                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
            else:
                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))

        # non-overlapping systematics are set to NaN by concat; fill with 0 instead.
        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

        all_pseudodata = (
            np.concatenate(pseudodatas, axis=0)
            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
        ) * (
            np.concatenate(mult_shifts, axis=0)
            * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1)
        )

        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
            break

    return all_pseudodata
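A note on the `while True` loop discussed in the first review thread: because positivity is checked on the concatenated pseudodata, a single negative non-asymmetry point rejects the draw for every dataset, ASY datasets included. A minimal, self-contained sketch of that rejection loop (toy numbers and names, not the validphys implementation):

import numpy as np

rng = np.random.default_rng(42)

# Toy central values; the mask is False for asymmetry (ASY) points,
# where negative pseudodata is acceptable.
central_values = np.array([0.3, 0.5, -0.1, 0.2])
check_positive_mask = np.array([True, True, False, True])

while True:
    # Draw a candidate replica for all points in one pass.
    pseudodata = central_values + 0.1 * rng.normal(size=central_values.size)
    # One negative masked point rejects the whole draw, ASY points included.
    if np.all(pseudodata[check_positive_mask] >= 0):
        break

print(pseudodata)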
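To make the second review thread's resolution concrete: systematics named UNCORR or CORR are sampled fresh inside the per-dataset loop, while specially named systematics are deferred and concatenated so that a shared (e.g. luminosity-type) source is sampled exactly once. A sketch of the column split, using toy data; the tuple below is an assumed stand-in for validphys.covmats.INTRA_DATASET_SYS_NAME:

import pandas as pd

# Assumption: mirrors validphys.covmats.INTRA_DATASET_SYS_NAME, the systematic
# names whose correlations do not extend beyond a single dataset.
INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")

# Toy systematics table for one dataset; columns carry the systype names.
mult_errors = pd.DataFrame(
    [[1.0, 2.0, 0.5], [1.5, 1.0, 0.5]],
    columns=["UNCORR", "CORR", "SPECIALLUMI"],
)

# Handled per dataset, with fresh random numbers for each dataset:
intra = mult_errors.loc[:, mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]

# Deferred and concatenated across datasets, so a shared luminosity-type
# systematic is sampled exactly once for every dataset carrying it:
special = mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]

print(list(special.columns))  # ['SPECIALLUMI']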
def indexed_make_replica(groups_index, make_replica):
    """Index the make_replica pseudodata appropriately"""
    return pd.DataFrame(make_replica, index=groups_index, columns=["data"])
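For illustration, here is what that wrapping produces, with a hypothetical three-level MultiIndex standing in for the validphys groups_index (the level names and values are invented for the example):

import numpy as np
import pandas as pd

# Hypothetical stand-in for groups_index: one row per data point,
# keyed by (group, dataset, point id).
groups_index = pd.MultiIndex.from_tuples(
    [("NMC", "NMC", 0), ("NMC", "NMC", 1), ("NMC", "NMCPD", 0)],
    names=["group", "dataset", "id"],
)
replica = np.array([0.257, 0.271, 0.275])

# The same wrapping indexed_make_replica performs: a one-column
# DataFrame labelled "data".
print(pd.DataFrame(replica, index=groups_index, columns=["data"]))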
@check_darwin_single_process
def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
    """A function to obtain information about the pseudodata that went
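Finally, on the ``seed`` parameter documented in the make_replica docstring: reproducibility follows directly from numpy's Generator semantics, since an identical seed yields an identical stream of draws, while ``seed=None`` falls back to fresh OS entropy. A quick illustration:

import numpy as np

# Identical seeds give identical generator streams, so two calls to
# make_replica with the same seed (and inputs) draw the same shifts.
draws_a = np.random.default_rng(seed=7).normal(size=3)
draws_b = np.random.default_rng(seed=7).normal(size=3)
assert np.array_equal(draws_a, draws_b)

# seed=None seeds from fresh OS entropy, the default numpy behaviour.
print(np.random.default_rng(seed=None).normal(size=3))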