Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
eef21af
A direct translation of the C++ pseudodata code
Aug 19, 2020
dc4dd22
Loop until positivity of pseudodata is achieved
Aug 20, 2020
9abb31c
Using pandas and numpy to generate the pseudodata
Aug 20, 2020
6841317
Adding docstring for make_replica function
Aug 24, 2020
8005e84
Apply suggestions from code review
siranipour Aug 26, 2020
01f2b7f
Moving to pseudodata.py
Aug 27, 2020
ed9edaa
Making the function into an infinite generator
Aug 27, 2020
7bc29fd
Adding comment explaining why we divide the multiplicative uncertaint…
Sep 4, 2020
3ccda95
Update validphys2/src/validphys/covmats.py
siranipour Sep 4, 2020
37b460c
Changing covmat_for_sampling to additive_covmat
Sep 4, 2020
078cd5c
Changing variable names to better reflect their purpose
Sep 16, 2020
55760a0
Accounting for the asymmetry edge case
Sep 16, 2020
8217d8d
Adding information on the nature of sampling in the docstring
Sep 16, 2020
f2d42c7
Correcting covmat_from_systematics example
Oct 6, 2020
98e53e3
Clarifying doc strings
Oct 13, 2020
213de7f
Setting default seed kwarg to None
Oct 13, 2020
612197f
Removing make replica from covmats.py
Nov 17, 2020
3f1ce74
Adding use_mult_errors flag to datasets_covmat_from_systematics
Nov 17, 2020
f0fa628
Adding functionality to compute pseudodata for a list of commondata
Dec 7, 2020
c7626b8
Adding fillna(0) to special additive errors
Dec 8, 2020
c3fd73f
Changing covmats.py to not use the use_mult_errors flag
Dec 9, 2020
a5213cf
Apply suggestions from code review - comments to split up code
wilsonmr Dec 16, 2020
7ee813f
include stat errors
wilsonmr Dec 16, 2020
46020df
Removing if seed is None
siranipour Jan 15, 2021
3d776a5
Changing generator to function
Jan 15, 2021
1984534
Updating doc string
Jan 15, 2021
b5df64e
Using new numpy.random interface
Jan 15, 2021
ed674e8
add copy to central values before generating data
wilsonmr Jan 27, 2021
f481034
added some tests to catch that central values are unchanged and that …
wilsonmr Jan 28, 2021
133c07c
update make replica test that commondata is unchanged to do what it s…
wilsonmr Feb 3, 2021
8378139
Apply suggestions from code review
siranipour Feb 8, 2021
50f33aa
Fixing rebase problems
Feb 8, 2021
432063d
iterable -> list in docstring
Feb 8, 2021
e00e8d0
using API to load cd
Feb 8, 2021
ae2623e
making make replica a provider
Feb 8, 2021
bbbd226
index pseudodata
Feb 8, 2021
6a12bd6
Adding regression test for make_replica
Feb 8, 2021
5638e49
Merge branch 'master' into python-pseudodata
siranipour Feb 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions validphys2/src/validphys/covmats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from reportengine.table import table

from validphys.calcutils import regularize_covmat, get_df_block
from validphys.core import PDF, DataGroupSpec, DataSetSpec
Comment thread
siranipour marked this conversation as resolved.
from validphys.checks import (
check_dataset_cuts_match_theorycovmat,
check_norm_threshold,
Expand Down Expand Up @@ -66,6 +67,7 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

Parameters
----------

loaded_commondata_with_cuts : validphys.coredata.CommonData
CommonData which stores information about systematic errors,
their treatment and description.
Expand All @@ -78,9 +80,9 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

Returns
-------
cov_mat : np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data
points after cuts) containing uncertainty and correlation information.
cov_mat: np.array
Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
containing uncertainty and correlation information.

Example
-------
Expand All @@ -102,7 +104,7 @@ def covmat_from_systematics(loaded_commondata_with_cuts, _central_values=None):

>>> from validphys.commondataparser import load_commondata
>>> from validphys.loader import Loader
>>> from validphys.calcutils import covmat_from_systematics
>>> from validphys.covmats import covmat_from_systematics
>>> l = Loader()
>>> cd = l.check_commondata("NMC")
>>> cd = load_commondata(cd)
Expand Down
124 changes: 124 additions & 0 deletions validphys2/src/validphys/pseudodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd

from validphys.checks import check_cuts_fromfit, check_darwin_single_process
from validphys.covmats import INTRA_DATASET_SYS_NAME

from reportengine import collect

Expand Down Expand Up @@ -104,6 +105,129 @@ def read_fit_pseudodata(fitcontext, context_index):
yield pseudodata.drop("type", axis=1), tr.index, val.index


def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None):
    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
    objects and returns a pseudodata replica accounting for
    possible correlations between systematic uncertainties.

    The function loops until positive pseudodata is generated for the
    non-asymmetry datasets. In the case of an asymmetry dataset negative
    values are permitted, so such datasets never cause the loop to repeat.

    Parameters
    ----------
    dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`]
        List of CommonData objects which stores information about systematic errors,
        their treatment and description, for each dataset.
    seed: int, None
        Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
        allocated using the default numpy behaviour.

    Returns
    -------
    pseudodata: np.array
        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
        containing monte carlo samples of data centered around the data central value.

    Example
    -------
    >>> from validphys.api import API
    >>> pseudodata = API.make_replica(
                        dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}],
                        use_cuts="nocuts",
                        theoryid=53
                    )
    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
           0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
           0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
    """
    # Seed the numpy RNG with the seed.
    rng = np.random.default_rng(seed=seed)

    # The outer while True loop is for ensuring a positive
    # pseudodata replica (checked at the bottom of the loop body).
    while True:
        pseudodatas = []
        special_add = []
        special_mult = []
        mult_shifts = []
        check_positive_masks = []
        for cd in dataset_inputs_loaded_cd_with_cuts:
            # copy here to avoid mutating the central values.
            pseudodata = cd.central_values.to_numpy(copy=True)

            # add contribution from statistical uncertainty
            pseudodata += cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)

            # ~~~ ADDITIVE ERRORS ~~~
            add_errors = cd.additive_errors
            add_uncorr_errors = add_errors.loc[:, add_errors.columns == "UNCORR"].to_numpy()
            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)

            # systematics correlated within this dataset only: a single random
            # number per systematic, shared by every point of the dataset.
            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])

            # append the partially shifted pseudodata
            pseudodatas.append(pseudodata)
            # store the additive errors which are correlated *across* datasets
            # (anything whose systype is not in INTRA_DATASET_SYS_NAME, e.g.
            # a shared luminosity): their random numbers must be sampled once
            # for all datasets, so they are applied after this loop.
            special_add.append(
                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # ~~~ MULTIPLICATIVE ERRORS ~~~
            # all multiplicative errors are stored as percentages, hence the
            # division by 100 to convert them to fractions.
            mult_errors = cd.multiplicative_errors
            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
            # generate all uncorrelated multiplicative shifts
            mult_shift = (
                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
            ).prod(axis=1)

            # generate shifts for systematics which are correlated within this dataset only
            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
            mult_shift *= (
                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
            ).prod(axis=1)

            # store multiplicative shifts to apply after all additive shifts
            mult_shifts.append(mult_shift)

            # store the multiplicative errors with correlations between datasets for later use
            special_mult.append(
                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
            )

            # mask out the data we want to check are all positive: asymmetry
            # observables may legitimately be negative.
            if "ASY" in cd.commondataproc:
                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
            else:
                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))

        # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

        # apply the cross-dataset correlated shifts: one random number per
        # special systematic, shared by every point of every dataset.
        all_pseudodata = (
            np.concatenate(pseudodatas, axis=0)
            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
        ) * (
            np.concatenate(mult_shifts, axis=0)
            * (
                1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100
            ).prod(axis=1)
        )

        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
            break

    return all_pseudodata


def indexed_make_replica(groups_index, make_replica):
    """Attach ``groups_index`` to the flat pseudodata array produced by
    ``make_replica``, returning a single-column ("data") DataFrame.
    """
    indexed_pseudodata = pd.DataFrame(make_replica, columns=["data"], index=groups_index)
    return indexed_pseudodata


@check_darwin_single_process
def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
"""A function to obtain information about the pseudodata that went
Expand Down
5 changes: 5 additions & 0 deletions validphys2/src/validphys/tableloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def fixup_header(df, head_index, dtype):
*oldcols.levels[head_index+1:]])
df.columns = newcols

def parse_data_cv(filename):
    """Useful for reading DataFrames with just one column."""
    return sane_load(filename, index_col=[0, 1, 2])

def parse_exp_mat(filename):
"""Parse a dump of a matrix like experiments_covmat."""
df = sane_load(filename, header=[0,1,2], index_col=[0,1,2])
Expand Down
Loading