Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8493c7e
A direct translation of the C++ pseudodata code
Aug 19, 2020
68e9ced
Loop until positivity of pseudodata is achieved
Aug 20, 2020
b2245c8
Using pandas and numpy to generate the pseudodata
Aug 20, 2020
196f31f
Adding docstring for make_replica function
Aug 24, 2020
c5ba90d
Apply suggestions from code review
siranipour Aug 26, 2020
ca2f214
Moving to pseudodata.py
Aug 27, 2020
821b6aa
Making the function into an infinite generator
Aug 27, 2020
7f4c3f5
Adding comment explaining why we divide the multiplicative uncertaint…
Sep 4, 2020
9f28ca9
Update validphys2/src/validphys/covmats.py
siranipour Sep 4, 2020
da36db5
Changing covmat_for_sampling to additive_covmat
Sep 4, 2020
5649743
Changing variable names to better reflect their purpose
Sep 16, 2020
fe6a653
Accounting for the asymmetry edge case
Sep 16, 2020
7d5c9d2
Adding information on the nature of sampling in the docstring
Sep 16, 2020
3365dec
Correcting covmat_from_systematics example
Oct 6, 2020
a3fe40f
Clarifying doc strings
Oct 13, 2020
79fbe95
Setting default seed kwarg to None
Oct 13, 2020
8905f0e
Removing make replica from covmats.py
Nov 17, 2020
0a8ec39
Adding use_mult_errors flag to datasets_covmat_from_systematics
Nov 17, 2020
a1820d3
Adding functionality to compute pseudodata for a list of commondata
Dec 7, 2020
5b33d27
Adding fillna(0) to special additive errors
Dec 8, 2020
cfdb946
Changing covmats.py to not use the use_mult_errors flag
Dec 9, 2020
0129724
Apply suggestions from code review - comments to split up code
wilsonmr Dec 16, 2020
6772fb6
include stat errors
wilsonmr Dec 16, 2020
b662abb
Removing if seed is None
siranipour Jan 15, 2021
a6423de
Changing generator to function
Jan 15, 2021
3419bc6
Updating doc string
Jan 15, 2021
14abf62
Using new numpy.random interface
Jan 15, 2021
9282cc1
Working on reconstructing pseudodata
Jan 21, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions validphys2/src/validphys/covmats.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,23 @@ def covmat_from_systematics(commondata):
`paper <https://arxiv.org/pdf/hep-ph/0501067.pdf>`_
outlining the procedure, specifically equation 2 and surrounding text.

Paramaters
Parameters
----------
commondata : validphys.coredata.CommonData
commondata: validphys.coredata.CommonData
CommonData which stores information about systematic errors,
their treatment and description.

Returns
-------
cov_mat : np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data
points after cuts) containing uncertainty and correlation information.
cov_mat: np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
containing uncertainty and correlation information.

Example
-------
>>> from validphys.commondataparser import load_commondata
>>> from validphys.loader import Loader
>>> from validphys.calcutils import covmat_from_systematics
>>> from validphys.covmats import covmat_from_systematics
>>> l = Loader()
>>> cd = l.check_commondata("NMC")
>>> cd = load_commondata(cd)
Expand Down
183 changes: 183 additions & 0 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,84 @@
import pandas as pd

from validphys.checks import check_cuts_fromfit, check_darwin_single_process
from validphys.commondataparser import load_commondata
from validphys.coredata import CommonData
from validphys.covmats import INTRA_DATASET_SYS_NAME

from reportengine import collect

import n3fit.io.reader as reader
from n3fit.performfit import initialize_seeds

import NNPDF

log = logging.getLogger(__name__)


# Reportengine collection: gathers the per-replica pseudodata information
# produced by ``fitted_pseudodata_internal``, evaluated in the fit's context.
fitted_pseudodata = collect('fitted_pseudodata_internal', ('fitcontext',))

# The ``groups_index`` of the fit, evaluated within the fit's context.
context_index = collect("groups_index", ("fitcontext",))


def reconstruct_replica(fit, list_of_commondata, replica: int):
    """Reconstruct the pseudodata seen by MC replica number ``replica`` of ``fit``.

    The seeds recorded in the fit runcard (``trvlseed``, ``nnseed`` and
    ``mcseed`` under the ``fitting`` namespace) are used to regenerate the
    pseudodata and its training/validation split.

    Parameters
    ----------
    fit: validphys.core.fit
        The input fit from whose runcard the seeds are read.
    list_of_commondata: iterable[validphys.coredata.CommonData]
        The commondata objects for which the pseudodata is regenerated.
    replica: int
        The replica number for which the pseudodata is reconstructed

    Returns
    -------
    training, validation: tuple
        The training and validation splits of the regenerated pseudodata.

    Raises
    ------
    KeyError:
        If one of ``trvlseed``, ``nnseed``, or ``mcseed`` is not found under the ``fitting``
        namespace

    Example:
    >>> from validphys.api import API
    >>> API.reconstruct_replica(fit="NNPDF40_nnlo_as_0118", replica=0)
    #TODO: finish this example
    """
    log.warning(f"Attempt to reconstruct pseudodata for MC replica {replica} "
                f"of {fit}. Be advised that this functionality is not available "
                "for all fits, but only those fitted after N3FIT switched to the "
                "Python MakeReplica function."
                )

    runcard = fit.as_input()

    # The seeds live under the ``fitting`` namespace of the runcard; older
    # fits without this layout raise a descriptive KeyError below.
    try:
        fit_ns = runcard["fitting"]
        trvlseed = fit_ns["trvlseed"]
        nnseed = fit_ns["nnseed"]
        mcseed = fit_ns["mcseed"]
    except KeyError as e:
        raise KeyError(f"The following keys were not found in the fitting namespace {str(e)}, "
                       "please ensure the input fit is a valid N3FIT fit."
                       ) from e

    # NOTE(review): ``seeds`` is computed but never used below --
    # ``make_replica`` and ``make_tr_val_mask`` are given the *base* runcard
    # seeds rather than the per-replica seeds. Confirm this is intended.
    seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep=True)

    pseudodata = make_replica(list_of_commondata, seed=mcseed)

    # TODO: change the need for datasets and exp_name
    trmasks, vlmasks = reader.make_tr_val_mask(datasets=None, exp_name=None, seed=trvlseed)

    # NOTE(review): ``make_replica`` returns a 1-D array, so the 2-D indexing
    # below looks inconsistent -- verify the expected shape of ``pseudodata``.
    training = pseudodata[:, trmasks]
    validation = pseudodata[:, vlmasks]

    return training, validation


@check_cuts_fromfit
def read_fit_pseudodata(fitcontext, context_index):
"""Generator to handle the reading of training and validation splits for a fit that has been
Expand Down Expand Up @@ -104,6 +169,124 @@ def read_fit_pseudodata(fitcontext, context_index):
yield pseudodata.drop("type", axis=1), tr.index, val.index


def make_replica(list_of_commondata, seed=None):
    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
    objects and returns pseudodata replicas of the data central value accounting for
    possible correlations between systematic uncertainties.

    The function loops until positive definite pseudodata is generated for any
    non-asymmetry datasets. In the case of an asymmetry dataset negative values are
    permitted so the loop block executes only once.

    Parameters
    ----------
    list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`]
        Iterable of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.

    seed: int, None
        Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
        allocated using the default numpy behaviour.

    Returns
    -------
    pseudodata: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing monte carlo samples of data centered around the data central value.

    Example
    -------
    >>> from validphys.commondataparser import load_commondata
    >>> from validphys.loader import Loader
    >>> from validphys.pseudodata import make_replica
    >>> l = Loader()
    >>> cd1 = l.check_commondata("NMC")
    >>> cd2 = l.check_commondata("NMCPD")
    >>> lds = map(load_commondata, (cd1, cd2))
    >>> make_replica(lds)
    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
           0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
           0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
    """
    # Materialize the input exactly once: the positivity loop below may need to
    # iterate the commondata several times, and a lazy iterable (e.g. the
    # ``map`` object in the example above) would be silently exhausted after
    # the first pass, leaving empty lists on any retry.
    list_of_commondata = list(list_of_commondata)

    # Seed the numpy RNG with the seed.
    rng = np.random.default_rng(seed=seed)

    # The inner while True loop is for ensuring a positive definite
    # pseudodata replica
    while True:
        pseudodatas = []
        special_add = []
        special_mult = []
        mult_shifts = []
        check_positive_masks = []
        for cd in list_of_commondata:
            pseudodata = cd.central_values.to_numpy()

            # add contribution from statistical uncertainty
            pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata))

            # ~~~ ADDITIVE ERRORS ~~~
            add_errors = cd.additive_errors
            add_uncorr_errors = add_errors.loc[:, add_errors.columns == "UNCORR"].to_numpy()

            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)

            # correlated within dataset: one draw per systematic, shared by all points
            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])

            # append the partially shifted pseudodata
            pseudodatas.append(pseudodata)
            # store the additive errors with correlations between datasets for later use
            special_add.append(
                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )
            # ~~~ MULTIPLICATIVE ERRORS ~~~
            mult_errors = cd.multiplicative_errors
            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
            # convert from percent to fraction
            mult_shift = (
                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
            ).prod(axis=1)

            # correlated within dataset: a (1, n_sys) draw broadcasts the same
            # shift per systematic across all data points
            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
            mult_shift *= (
                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
            ).prod(axis=1)

            mult_shifts.append(mult_shift)

            # store the multiplicative errors with correlations between datasets for later use
            special_mult.append(
                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # mask out the data we want to check are all positive
            if "ASY" in cd.commondataproc:
                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
            else:
                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))

        # if we sort here (which sorts columns), then permuting datasets doesn't change the result
        # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

        all_pseudodata = (
            np.concatenate(pseudodatas, axis=0)
            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
        ) * (
            np.concatenate(mult_shifts, axis=0)
            * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1)
        )

        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
            break

    return all_pseudodata


@check_darwin_single_process
def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
"""A function to obtain information about the pseudodata that went
Expand Down