Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
eef21af
A direct translation of the C++ pseudodata code
Aug 19, 2020
dc4dd22
Loop until positivity of pseudodata is achieved
Aug 20, 2020
9abb31c
Using pandas and numpy to generate the pseudodata
Aug 20, 2020
6841317
Adding docstring for make_replica function
Aug 24, 2020
8005e84
Apply suggestions from code review
siranipour Aug 26, 2020
01f2b7f
Moving to pseudodata.py
Aug 27, 2020
ed9edaa
Making the function into an infinite generator
Aug 27, 2020
7bc29fd
Adding comment explaining why we divide the multiplicative uncertaint…
Sep 4, 2020
3ccda95
Update validphys2/src/validphys/covmats.py
siranipour Sep 4, 2020
37b460c
Changing covmat_for_sampling to additive_covmat
Sep 4, 2020
078cd5c
Changing variable names to better reflect their purpose
Sep 16, 2020
55760a0
Accounting for the asymmetry edge case
Sep 16, 2020
8217d8d
Adding information on the nature of sampling in the docstring
Sep 16, 2020
f2d42c7
Correcting covmat_from_systematics example
Oct 6, 2020
98e53e3
Clarifying doc strings
Oct 13, 2020
213de7f
Setting default seed kwarg to None
Oct 13, 2020
612197f
Removing make replica from covmats.py
Nov 17, 2020
3f1ce74
Adding use_mult_errors flag to datasets_covmat_from_systematics
Nov 17, 2020
f0fa628
Adding functionality to compute pseudodata for a list of commondata
Dec 7, 2020
c7626b8
Adding fillna(0) to special additive errors
Dec 8, 2020
c3fd73f
Changing covmats.py to not use the use_mult_errors flag
Dec 9, 2020
a5213cf
Apply suggestions from code review - comments to split up code
wilsonmr Dec 16, 2020
7ee813f
include stat errors
wilsonmr Dec 16, 2020
46020df
Removing if seed is None
siranipour Jan 15, 2021
3d776a5
Changing generator to function
Jan 15, 2021
1984534
Updating doc string
Jan 15, 2021
b5df64e
Using new numpy.random interface
Jan 15, 2021
ed674e8
add copy to central values before generating data
wilsonmr Jan 27, 2021
f481034
added some tests to catch that central values are unchanged and that …
wilsonmr Jan 28, 2021
133c07c
update make replica test that commondata is unchanged to do what it s…
wilsonmr Feb 3, 2021
8378139
Apply suggestions from code review
siranipour Feb 8, 2021
50f33aa
Fixing rebase problems
Feb 8, 2021
432063d
iterable -> list in docstring
Feb 8, 2021
e00e8d0
using API to load cd
Feb 8, 2021
ae2623e
making make replica a provider
Feb 8, 2021
bbbd226
index pseudodata
Feb 8, 2021
6a12bd6
Adding regression test for make_replica
Feb 8, 2021
5638e49
Merge branch 'master' into python-pseudodata
siranipour Feb 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions validphys2/src/validphys/covmats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from reportengine.table import table

from validphys.calcutils import regularize_covmat, get_df_block
from validphys.core import PDF, DataGroupSpec, DataSetSpec
Comment thread
siranipour marked this conversation as resolved.
from validphys.checks import (
check_dataset_cuts_match_theorycovmat,
check_norm_threshold,
Expand Down Expand Up @@ -66,6 +67,7 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

Parameters
----------

loaded_commondata_with_cuts : validphys.coredata.CommonData
CommonData which stores information about systematic errors,
their treatment and description.
Expand All @@ -78,9 +80,9 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

Returns
-------
cov_mat : np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data
points after cuts) containing uncertainty and correlation information.
cov_mat: np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
containing uncertainty and correlation information.

Example
-------
Expand All @@ -102,7 +104,7 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

>>> from validphys.commondataparser import load_commondata
>>> from validphys.loader import Loader
>>> from validphys.calcutils import covmat_from_systematics
>>> from validphys.covmats import covmat_from_systematics
>>> l = Loader()
>>> cd = l.check_commondata("NMC")
>>> cd = load_commondata(cd)
Expand Down
124 changes: 124 additions & 0 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd

from validphys.checks import check_cuts_fromfit, check_darwin_single_process
from validphys.covmats import INTRA_DATASET_SYS_NAME

from reportengine import collect

Expand Down Expand Up @@ -104,6 +105,129 @@ def read_fit_pseudodata(fitcontext, context_index):
yield pseudodata.drop("type", axis=1), tr.index, val.index


def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None):
    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
    objects and returns a pseudodata replica accounting for
    possible correlations between systematic uncertainties.

    The function loops until positive pseudodata is generated for the
    non-asymmetry datasets. In the case of an asymmetry dataset negative
    values are permitted, so such datasets never cause the loop to repeat.

    Parameters
    ----------
    dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    seed: int, None
        Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
        allocated using the default numpy behaviour.

    Returns
    -------
    pseudodata: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing monte carlo samples of data centered around the data central value.

    Example
    -------
    >>> from validphys.api import API
    >>> pseudodata = API.make_replica(
                        dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}],
                        use_cuts="nocuts",
                        theoryid=53
                    )
    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
           0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
           0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
    """
    # Seed the numpy RNG with the seed.
    rng = np.random.default_rng(seed=seed)

    # The outer while True loop is for ensuring a positive
    # pseudodata replica (checked at the bottom of the loop body).
    while True:
        pseudodatas = []
        special_add = []
        special_mult = []
        mult_shifts = []
        check_positive_masks = []
        for cd in dataset_inputs_loaded_cd_with_cuts:
            # copy here to avoid mutating the central values.
            pseudodata = cd.central_values.to_numpy(copy=True)

            # add contribution from statistical uncertainty
            pseudodata += cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)

            # ~~~ ADDITIVE ERRORS ~~~
            add_errors = cd.additive_errors
            add_uncorr_errors = add_errors.loc[:, add_errors.columns == "UNCORR"].to_numpy()
            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)

            # systematics correlated within this dataset only: a single random
            # number per systematic, shared by every point of the dataset.
            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])

            # append the partially shifted pseudodata
            pseudodatas.append(pseudodata)
            # store the additive errors which are correlated *across* datasets
            # (anything whose systype is not in INTRA_DATASET_SYS_NAME, e.g.
            # a shared luminosity): their random numbers must be sampled once
            # for all datasets, so they are applied after this loop.
            special_add.append(
                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # ~~~ MULTIPLICATIVE ERRORS ~~~
            # all multiplicative errors are stored as percentages, hence the
            # division by 100 to convert them to fractions.
            mult_errors = cd.multiplicative_errors
            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
            # generate all uncorrelated multiplicative shifts
            mult_shift = (
                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
            ).prod(axis=1)

            # generate shifts for systematics which are correlated within this dataset only
            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
            mult_shift *= (
                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
            ).prod(axis=1)

            # store multiplicative shifts to apply after all additive shifts
            mult_shifts.append(mult_shift)

            # store the multiplicative errors with correlations between datasets for later use
            special_mult.append(
                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # mask out the data we want to check are all positive: asymmetry
            # observables may legitimately be negative.
            if "ASY" in cd.commondataproc:
                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
            else:
                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))

        # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

        # apply the cross-dataset correlated shifts: one random number per
        # special systematic, shared by every point of every dataset.
        all_pseudodata = (
            np.concatenate(pseudodatas, axis=0)
            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
        ) * (
            np.concatenate(mult_shifts, axis=0)
            * (
                1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
            ).prod(axis=1)
        )

        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
            break

    return all_pseudodata


def indexed_make_replica(groups_index, make_replica):
    """Attach ``groups_index`` to the flat pseudodata array produced by
    ``make_replica``, returning a single-column ("data") DataFrame.
    """
    indexed_pseudodata = pd.DataFrame(make_replica, columns=["data"], index=groups_index)
    return indexed_pseudodata


@check_darwin_single_process
def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
"""A function to obtain information about the pseudodata that went
Expand Down
5 changes: 5 additions & 0 deletions validphys2/src/validphys/tableloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def fixup_header(df, head_index, dtype):
*oldcols.levels[head_index+1:]])
df.columns = newcols

def parse_data_cv(filename):
    """Useful for reading DataFrames with just one column."""
    return sane_load(filename, index_col=[0, 1, 2])

def parse_exp_mat(filename):
"""Parse a dump of a matrix like experiments_covmat."""
df = sane_load(filename, header=[0,1,2], index_col=[0,1,2])
Expand Down
Loading