Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8493c7e
A direct translation of the C++ pseudodata code
Aug 19, 2020
68e9ced
Loop until positivity of pseudodata is achieved
Aug 20, 2020
b2245c8
Using pandas and numpy to generate the pseudodata
Aug 20, 2020
196f31f
Adding docstring for make_replica function
Aug 24, 2020
c5ba90d
Apply suggestions from code review
siranipour Aug 26, 2020
ca2f214
Moving to pseudodata.py
Aug 27, 2020
821b6aa
Making the function into an infinite generator
Aug 27, 2020
7f4c3f5
Adding comment explaining why we divide the multiplicative uncertaint…
Sep 4, 2020
9f28ca9
Update validphys2/src/validphys/covmats.py
siranipour Sep 4, 2020
da36db5
Changing covmat_for_sampling to additive_covmat
Sep 4, 2020
5649743
Changing variable names to better reflect their purpose
Sep 16, 2020
fe6a653
Accounting for the asymmetry edge case
Sep 16, 2020
7d5c9d2
Adding information on the nature of sampling in the docstring
Sep 16, 2020
3365dec
Correcting covmat_from_systematics example
Oct 6, 2020
a3fe40f
Clarifying doc strings
Oct 13, 2020
79fbe95
Setting default seed kwarg to None
Oct 13, 2020
8905f0e
Removing make replica from covmats.py
Nov 17, 2020
0a8ec39
Adding use_mult_errors flag to datasets_covmat_from_systematics
Nov 17, 2020
a1820d3
Adding functionality to compute pseudodata for a list of commondata
Dec 7, 2020
5b33d27
Adding fillna(0) to special additive errors
Dec 8, 2020
cfdb946
Changing covmats.py to not use the use_mult_errors flag
Dec 9, 2020
0129724
Apply suggestions from code review - comments to split up code
wilsonmr Dec 16, 2020
6772fb6
include stat errors
wilsonmr Dec 16, 2020
b662abb
Removing if seed is None
siranipour Jan 15, 2021
a6423de
Changing generator to function
Jan 15, 2021
3419bc6
Updating doc string
Jan 15, 2021
14abf62
Using new numpy.random interface
Jan 15, 2021
9282cc1
Working on reconstructing pseudodata
Jan 21, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions validphys2/src/validphys/covmats.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,23 @@ def covmat_from_systematics(commondata):
`paper <https://arxiv.org/pdf/hep-ph/0501067.pdf>`_
outlining the procedure, specifically equation 2 and surrounding text.

Paramaters
Parameters
----------
commondata : validphys.coredata.CommonData
commondata: validphys.coredata.CommonData
CommonData which stores information about systematic errors,
their treatment and description.

Returns
-------
cov_mat : np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data
points after cuts) containing uncertainty and correlation information.
cov_mat: np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
containing uncertainty and correlation information.

Example
-------
>>> from validphys.commondataparser import load_commondata
>>> from validphys.loader import Loader
>>> from validphys.calcutils import covmat_from_systematics
>>> from validphys.covmats import covmat_from_systematics
>>> l = Loader()
>>> cd = l.check_commondata("NMC")
>>> cd = load_commondata(cd)
Expand Down
183 changes: 183 additions & 0 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,84 @@
import pandas as pd

from validphys.checks import check_cuts_fromfit, check_darwin_single_process
from validphys.commondataparser import load_commondata
from validphys.coredata import CommonData
from validphys.covmats import INTRA_DATASET_SYS_NAME

from reportengine import collect

import n3fit.io.reader as reader
from n3fit.performfit import initialize_seeds

import NNPDF

log = logging.getLogger(__name__)


# Reportengine collection: gathers the per-replica pseudodata information
# produced by ``fitted_pseudodata_internal``, evaluated in the fit's context.
fitted_pseudodata = collect('fitted_pseudodata_internal', ('fitcontext',))

# The ``groups_index`` of the fit, evaluated within the fit's context.
context_index = collect("groups_index", ("fitcontext",))


def reconstruct_replica(fit, list_of_commondata, replica: int):
    """Reconstruct the pseudodata seen by MC replica number ``replica`` of ``fit``.

    The seeds recorded in the fit runcard (``trvlseed``, ``nnseed`` and
    ``mcseed`` under the ``fitting`` namespace) are used to regenerate the
    pseudodata and its training/validation split.

    Parameters
    ----------
    fit: validphys.core.fit
        The input fit from whose runcard the seeds are read.
    list_of_commondata: iterable[validphys.coredata.CommonData]
        The commondata objects for which the pseudodata is regenerated.
    replica: int
        The replica number for which the pseudodata is reconstructed

    Returns
    -------
    training, validation: tuple
        The training and validation splits of the regenerated pseudodata.

    Raises
    ------
    KeyError:
        If one of ``trvlseed``, ``nnseed``, or ``mcseed`` is not found under the ``fitting``
        namespace

    Example:
    >>> from validphys.api import API
    >>> API.reconstruct_replica(fit="NNPDF40_nnlo_as_0118", replica=0)
    #TODO: finish this example
    """
    log.warning(f"Attempt to reconstruct pseudodata for MC replica {replica} "
                f"of {fit}. Be advised that this functionality is not available "
                "for all fits, but only those fitted after N3FIT switched to the "
                "Python MakeReplica function."
                )

    runcard = fit.as_input()

    # The seeds live under the ``fitting`` namespace of the runcard; older
    # fits without this layout raise a descriptive KeyError below.
    try:
        fit_ns = runcard["fitting"]
        trvlseed = fit_ns["trvlseed"]
        nnseed = fit_ns["nnseed"]
        mcseed = fit_ns["mcseed"]
    except KeyError as e:
        raise KeyError(f"The following keys were not found in the fitting namespace {str(e)}, "
                       "please ensure the input fit is a valid N3FIT fit."
                       ) from e

    # NOTE(review): ``seeds`` is computed but never used below --
    # ``make_replica`` and ``make_tr_val_mask`` are given the *base* runcard
    # seeds rather than the per-replica seeds. Confirm this is intended.
    seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep=True)

    pseudodata = make_replica(list_of_commondata, seed=mcseed)

    # TODO: change the need for datasets and exp_name
    trmasks, vlmasks = reader.make_tr_val_mask(datasets=None, exp_name=None, seed=trvlseed)

    # NOTE(review): ``make_replica`` returns a 1-D array, so the 2-D indexing
    # below looks inconsistent -- verify the expected shape of ``pseudodata``.
    training = pseudodata[:, trmasks]
    validation = pseudodata[:, vlmasks]

    return training, validation


@check_cuts_fromfit
def read_fit_pseudodata(fitcontext, context_index):
"""Generator to handle the reading of training and validation splits for a fit that has been
Expand Down Expand Up @@ -104,6 +169,124 @@ def read_fit_pseudodata(fitcontext, context_index):
yield pseudodata.drop("type", axis=1), tr.index, val.index


def make_replica(list_of_commondata, seed=None):
    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
    objects and returns pseudodata replicas of the data central value accounting for
    possible correlations between systematic uncertainties.

    The function loops until positive definite pseudodata is generated for any
    non-asymmetry datasets. In the case of an asymmetry dataset negative values are
    permitted so the loop block executes only once.

    Parameters
    ----------
    list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`]
        Iterable of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.

    seed: int, None
        Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
        allocated using the default numpy behaviour.

    Returns
    -------
    pseudodata: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing monte carlo samples of data centered around the data central value.

    Example
    -------
    >>> from validphys.commondataparser import load_commondata
    >>> from validphys.loader import Loader
    >>> from validphys.pseudodata import make_replica
    >>> l = Loader()
    >>> cd1 = l.check_commondata("NMC")
    >>> cd2 = l.check_commondata("NMCPD")
    >>> lds = map(load_commondata, (cd1, cd2))
    >>> make_replica(lds)
    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
           0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
           0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
    """
    # Materialize the input exactly once: the positivity loop below may need to
    # iterate the commondata several times, and a lazy iterable (e.g. the
    # ``map`` object in the example above) would be silently exhausted after
    # the first pass, leaving empty lists on any retry.
    list_of_commondata = list(list_of_commondata)

    # Seed the numpy RNG with the seed.
    rng = np.random.default_rng(seed=seed)

    # The inner while True loop is for ensuring a positive definite
    # pseudodata replica
    while True:
        pseudodatas = []
        special_add = []
        special_mult = []
        mult_shifts = []
        check_positive_masks = []
        for cd in list_of_commondata:
            pseudodata = cd.central_values.to_numpy()

            # add contribution from statistical uncertainty
            pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata))

            # ~~~ ADDITIVE ERRORS ~~~
            add_errors = cd.additive_errors
            add_uncorr_errors = add_errors.loc[:, add_errors.columns == "UNCORR"].to_numpy()

            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)

            # correlated within dataset: one draw per systematic, shared by all points
            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])

            # append the partially shifted pseudodata
            pseudodatas.append(pseudodata)
            # store the additive errors with correlations between datasets for later use
            special_add.append(
                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )
            # ~~~ MULTIPLICATIVE ERRORS ~~~
            mult_errors = cd.multiplicative_errors
            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
            # convert from percent to fraction
            mult_shift = (
                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
            ).prod(axis=1)

            # correlated within dataset: a (1, n_sys) draw broadcasts the same
            # shift per systematic across all data points
            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
            mult_shift *= (
                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
            ).prod(axis=1)

            mult_shifts.append(mult_shift)

            # store the multiplicative errors with correlations between datasets for later use
            special_mult.append(
                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # mask out the data we want to check are all positive
            if "ASY" in cd.commondataproc:
                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
            else:
                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))

        # if we sort here (which sorts columns), then permuting datasets doesn't change the result
        # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

        all_pseudodata = (
            np.concatenate(pseudodatas, axis=0)
            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
        ) * (
            np.concatenate(mult_shifts, axis=0)
            * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1)
        )

        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
            break

    return all_pseudodata


@check_darwin_single_process
def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
"""A function to obtain information about the pseudodata that went
Expand Down