From eef21af15f1e256c58fa490d231700e4c6fb37b6 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Wed, 19 Aug 2020 14:57:02 +0100
Subject: [PATCH 01/37] A direct translation of the C++ pseudodata code No
 attempt at optimization has been done Uses the same seed structure as C++ so
 will give the same pseudodata if the same RNG seed is used. This commit is a
 proof of concept.

---
 validphys2/src/validphys/covmats.py | 62 +++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 74989025a0..1c2c59272f 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -11,6 +11,8 @@
 from reportengine.table import table
 
 from validphys.calcutils import regularize_covmat, get_df_block
+from validphys.core import PDF, DataGroupSpec, DataSetSpec
+from validphys.commondataparser import load_commondata
 from validphys.checks import (
     check_dataset_cuts_match_theorycovmat,
     check_norm_threshold,
@@ -21,10 +23,64 @@
 from validphys.core import PDF, DataGroupSpec, DataSetSpec
 from validphys.results import ThPredictionsResult
 
+import NNPDF
+
 log = logging.getLogger(__name__)
 
 INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")
 
+def make_replica(commondata, seed=1):
+    # dataset_input: {dataset: NMC, frac: 0.5}
+
+    # use_cuts: nocuts
+
+    # theoryid: 53
+
+    # template_text: |
+      # {@make_replica@}
+
+    # actions_:
+      # - report(main=True)
+
+    # TODO: eventually remove the inhouse random number generator
+    # in favour of e.g numpy
+    NNPDF.RandomGenerator.InitRNG(0, seed)
+    rng = NNPDF.RandomGenerator.GetRNG()
+
+    ld_cd = load_commondata(commondata)
+    covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
+    sampling_matrix = sqrt_covmat(covmat_for_sampling)
+
+    rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)]
+    deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)]
+    deviates, rand = map(np.array, (deviates, rand))
+
+    correlated_deviates = sampling_matrix @ deviates
+
+    artdata = np.array(ld_cd.central_values) + correlated_deviates
+
+    sys_table = ld_cd.sys_errors
+    for i in range(ld_cd.ndata):
+        xnor = 1
+        for j in range(ld_cd.nsys):
+            sys = sys_table.iloc[i, j]
+            if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'):
+                continue
+
+            if sys.name == 'UNCORR':
+                if sys.sys_type == 'ADD':
+                    continue
+                xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100)
+            else:
+                if sys.sys_type == 'ADD':
+                    continue
+                xnor *= (1.0 + rand[j] * sys.mult/100)
+        artdata[i] = xnor * artdata[i]
+
+    # TODO: check the pseudodata is positive
+    # requires implementing the above code in a while loop
+    return artdata
+
 
 def covmat_from_systematics(commondata, central_values=None):
     """Take the statistical uncertainty and systematics table from
@@ -74,6 +130,12 @@ def covmat_from_systematics(commondata, central_values=None):
         can be used to calculate, for example, the t0 covariance matrix by
         using the predictions from the central member of the t0 pdf.
 
+    use_mult_errors: bool
+        Boolean which controls whether we use multiplicative errors
+        when computing the covmat. By default it should be set to ``True``
+        unles one wishes to uses a sampling matrix for pseudodata generation,
+        in which case it should be set to ``False``.
+
     Returns
     -------
     cov_mat : np.array

From dc4dd22118bf3b841916acdc5d96ed7be44c36d3 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Thu, 20 Aug 2020 11:46:02 +0100
Subject: [PATCH 02/37] Loop until positivity of pseudodata is achieved

---
 validphys2/src/validphys/covmats.py | 54 ++++++++++++++++-------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 1c2c59272f..835b8d62d0 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -51,34 +51,40 @@ def make_replica(commondata, seed=1):
     covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
     sampling_matrix = sqrt_covmat(covmat_for_sampling)
 
-    rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)]
-    deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)]
-    deviates, rand = map(np.array, (deviates, rand))
-
-    correlated_deviates = sampling_matrix @ deviates
+    # We need to loop until the pseudodata is positive
+    while True:
+        rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)]
+        deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)]
+        deviates, rand = map(np.array, (deviates, rand))
+
+        correlated_deviates = sampling_matrix @ deviates
+
+        artdata = np.array(ld_cd.central_values) + correlated_deviates
+
+        sys_table = ld_cd.sys_errors
+        for i in range(ld_cd.ndata):
+            xnor = 1
+            for j in range(ld_cd.nsys):
+                sys = sys_table.iloc[i, j]
+                if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'):
+                    continue
 
-    artdata = np.array(ld_cd.central_values) + correlated_deviates
+                if sys.name == 'UNCORR':
+                    if sys.sys_type == 'ADD':
+                        continue
+                    xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100)
+                else:
+                    if sys.sys_type == 'ADD':
+                        continue
+                    xnor *= (1.0 + rand[j] * sys.mult/100)
+            artdata[i] = xnor * artdata[i]
 
-    sys_table = ld_cd.sys_errors
-    for i in range(ld_cd.ndata):
-        xnor = 1
-        for j in range(ld_cd.nsys):
-            sys = sys_table.iloc[i, j]
-            if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'):
-                continue
+        artdata = np.array(artdata)
 
-            if sys.name == 'UNCORR':
-                if sys.sys_type == 'ADD':
-                    continue
-                xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100)
-            else:
-                if sys.sys_type == 'ADD':
-                    continue
-                xnor *= (1.0 + rand[j] * sys.mult/100)
-        artdata[i] = xnor * artdata[i]
+        # Check positivity of the pseudodata
+        if all(artdata > 0):
+            break
 
-    # TODO: check the pseudodata is positive
-    # requires implementing the above code in a while loop
     return artdata
 
 

From 9abb31c2ed1905191100ff8b8f989aff9fff8518 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Thu, 20 Aug 2020 15:47:37 +0100
Subject: [PATCH 03/37] Using pandas and numpy to generate the pseudodata

This removes the for loop implementation
---
 validphys2/src/validphys/covmats.py | 73 +++++++++++++++++++----------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 835b8d62d0..963893bc4f 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -51,41 +51,64 @@ def make_replica(commondata, seed=1):
     covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
     sampling_matrix = sqrt_covmat(covmat_for_sampling)
 
+    central_values = ld_cd.central_values
+    nsys, ndata = ld_cd.nsys, ld_cd.ndata
+
+    # Define these variables here to prevent them
+    # from being redefined in the while loop
+    sys_names = ld_cd.systype_table["name"]
+    sys_types = ld_cd.systype_table["type"]
+    systematics = ld_cd.sys_errors
+
+    # Drop any systematics that are related to theory uncertainties
+    # or have explicitly been labelled as a systematic to be skipped.
+    # We are only interested in multiplicative uncertainties
+    mult_uncorr_sys = systematics.loc[
+        :,
+        (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
+        & (sys_types == "MULT"),
+    ]
+    mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
+
+    # We want to know which of the remaining systematics were
+    # uncorrelated. The other we mark as NaN
+    mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
+
     # We need to loop until the pseudodata is positive
     while True:
-        rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)]
-        deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)]
-        deviates, rand = map(np.array, (deviates, rand))
+        # It is useful to have this quantity as a pandas.Series
+        rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
+        # And this as a numpy.array
+        deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
 
         correlated_deviates = sampling_matrix @ deviates
 
-        artdata = np.array(ld_cd.central_values) + correlated_deviates
-
-        sys_table = ld_cd.sys_errors
-        for i in range(ld_cd.ndata):
-            xnor = 1
-            for j in range(ld_cd.nsys):
-                sys = sys_table.iloc[i, j]
-                if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'):
-                    continue
-
-                if sys.name == 'UNCORR':
-                    if sys.sys_type == 'ADD':
-                        continue
-                    xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100)
-                else:
-                    if sys.sys_type == 'ADD':
-                        continue
-                    xnor *= (1.0 + rand[j] * sys.mult/100)
-            artdata[i] = xnor * artdata[i]
+        pseudodata = np.array(central_values) + correlated_deviates
+
+        # Matrix of random numbers generated by either sampling from rand in the
+        # correct manner, or by requesting a new random number if the systematic
+        # error type was UNCORR
+        random_matrix = np.array(
+            [
+                [
+                    rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
+                    for i, sys_type in mult_uncorr_names.iteritems()
+                ]
+                for _ in range(ndata)
+            ]
+        )
 
-        artdata = np.array(artdata)
+        # The final pseudodata is obtained by doing an elementwise multiplication
+        # of the multiplicative uncertainties with the above random matrix. We then
+        # add unity to all elements and take the product across columns. This vector
+        # must then be elementwise multiplied by central values + correlated deviates.
+        pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
 
         # Check positivity of the pseudodata
-        if all(artdata > 0):
+        if all(pseudodata > 0):
             break
 
-    return artdata
+    return pseudodata
 
 
 def covmat_from_systematics(commondata, central_values=None):

From 684131710bfe23c704448555cd06532dd39aca1c Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 24 Aug 2020 15:24:47 +0100
Subject: [PATCH 04/37] Adding docstring for make_replica function

Also correcting some typos in the docstring of covmat_from_systematics
---
 validphys2/src/validphys/covmats.py | 62 ++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 963893bc4f..41591541cb 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -12,6 +12,7 @@
 
 from validphys.calcutils import regularize_covmat, get_df_block
 from validphys.core import PDF, DataGroupSpec, DataSetSpec
+from validphys.coredata import CommonData
 from validphys.commondataparser import load_commondata
 from validphys.checks import (
     check_dataset_cuts_match_theorycovmat,
@@ -30,24 +31,58 @@
 INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")
 
 def make_replica(commondata, seed=1):
-    # dataset_input: {dataset: NMC, frac: 0.5}
+    """Function that takes in a :py:class:`validphys.coredata.CommonData` or
+    :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas
+    of the data central value.
 
-    # use_cuts: nocuts
+    The square root (obtained by the Cholesky decomposition) of the covariance
+    matrix is used as the sampling matrix. The covariance matrix is obtained using
+    :py:func:`validphys.covmats.covmat_from_systematics` and must not use
+    multiplicative systematic uncertainties when obtaining the sampling matrix.
 
-    # theoryid: 53
+    Parameters
+    ---------
+    commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec`
+        CommonData which stores information about systematic errors,
+        their treatment and description.
 
-    # template_text: |
-      # {@make_replica@}
+    seed: int
+        Seed used to initialise the random number generator.
 
-    # actions_:
-      # - report(main=True)
+    Returns
+    -------
+    pseudodata: np.array
+        Numpy array which is N_dat (where N_dat is the number of data points after cuts)
+        containing monte carlo samples of data centered around the data central value.
 
+
+    Example
+    -------
+    >>> from validphys.loader import Loader
+    >>> l = Loader()
+    >>> from validphys.covmats import make_replica
+    >>> cd = l.check_commondata("NMC")
+    >>> make_replica(cd)
+    - Random Generator allocated: ranlux
+    array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631,
+       0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
+       0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
+       0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
+       0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 ,
+
+    .. todo:: Replace the inhouse random number generation with a numpy equivalent.
+    .. todo:: Allow for correlations between datasets within an experiment.
+    """
     # TODO: eventually remove the inhouse random number generator
     # in favour of e.g numpy
     NNPDF.RandomGenerator.InitRNG(0, seed)
     rng = NNPDF.RandomGenerator.GetRNG()
 
-    ld_cd = load_commondata(commondata)
+    if not isinstance(commondata, CommonData):
+        ld_cd = load_commondata(commondata)
+    else:
+        ld_cd = commondata
+
     covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
     sampling_matrix = sqrt_covmat(covmat_for_sampling)
 
@@ -70,8 +105,7 @@ def make_replica(commondata, seed=1):
     ]
     mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
 
-    # We want to know which of the remaining systematics were
-    # uncorrelated. The other we mark as NaN
+    # We want to know the names of the remaining systematics.
     mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
 
     # We need to loop until the pseudodata is positive
@@ -149,7 +183,7 @@ def covmat_from_systematics(commondata, central_values=None):
 
     Parameters
     ----------
-    commondata : validphys.coredata.CommonData
+    commondata: validphys.coredata.CommonData
         CommonData which stores information about systematic errors,
         their treatment and description.
     central_values : None, np.array
@@ -167,9 +201,9 @@ def covmat_from_systematics(commondata, central_values=None):
 
     Returns
     -------
-    cov_mat : np.array
-        Numpy array which is N_dat x N_dat (where N_dat is the number of data
-        points after cuts) containing uncertainty and correlation information.
+    cov_mat: np.array
+        Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
+        containing uncertainty and correlation information.
 
     Example
     -------

From 8005e8403a952c768c22e622e66edbb0ddd3e371 Mon Sep 17 00:00:00 2001
From: siranipour <43517072+siranipour@users.noreply.github.com>
Date: Wed, 26 Aug 2020 13:38:58 +0100
Subject: [PATCH 05/37] Apply suggestions from code review

Co-authored-by: Cameron Voisey <32741139+voisey@users.noreply.github.com>
---
 validphys2/src/validphys/covmats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 41591541cb..88459ea5e4 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -32,7 +32,7 @@
 
 def make_replica(commondata, seed=1):
     """Function that takes in a :py:class:`validphys.coredata.CommonData` or
-    :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas
+    :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas
     of the data central value.
 
     The square root (obtained by the Cholesky decomposition) of the covariance
@@ -196,7 +196,7 @@ def covmat_from_systematics(commondata, central_values=None):
     use_mult_errors: bool
         Boolean which controls whether we use multiplicative errors
         when computing the covmat. By default it should be set to ``True``
-        unles one wishes to uses a sampling matrix for pseudodata generation,
+        unless one wishes to uses a sampling matrix for pseudodata generation,
         in which case it should be set to ``False``.
 
     Returns

From 01f2b7fc14debfd639df28d83ed40517bf71eac9 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Thu, 27 Aug 2020 13:00:32 +0100
Subject: [PATCH 06/37] Moving to pseudodata.py

---
 validphys2/src/validphys/covmats.py    |   4 -
 validphys2/src/validphys/pseudodata.py | 120 +++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 88459ea5e4..efdf356433 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -12,8 +12,6 @@
 
 from validphys.calcutils import regularize_covmat, get_df_block
 from validphys.core import PDF, DataGroupSpec, DataSetSpec
-from validphys.coredata import CommonData
-from validphys.commondataparser import load_commondata
 from validphys.checks import (
     check_dataset_cuts_match_theorycovmat,
     check_norm_threshold,
@@ -24,8 +22,6 @@
 from validphys.core import PDF, DataGroupSpec, DataSetSpec
 from validphys.results import ThPredictionsResult
 
-import NNPDF
-
 log = logging.getLogger(__name__)
 
 INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")
diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index d62331f467..d0990114ab 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -12,12 +12,17 @@
 import pandas as pd
 
 from validphys.checks import check_cuts_fromfit, check_darwin_single_process
+from validphys.commondataparser import load_commondata
+from validphys.coredata import CommonData
+from validphys.covmats import covmat_from_systematics, sqrt_covmat
 
 from reportengine import collect
 
 import n3fit.io.reader as reader
 from n3fit.performfit import initialize_seeds
 
+import NNPDF
+
 log = logging.getLogger(__name__)
 
 
@@ -104,6 +109,121 @@ def read_fit_pseudodata(fitcontext, context_index):
         yield pseudodata.drop("type", axis=1), tr.index, val.index
 
 
+def make_replica(commondata, seed=1):
+    """Function that takes in a :py:class:`validphys.coredata.CommonData` or
+    :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas
+    of the data central value.
+
+    The square root (obtained by the Cholesky decomposition) of the covariance
+    matrix is used as the sampling matrix. The covariance matrix is obtained using
+    :py:func:`validphys.covmats.covmat_from_systematics` and must not use
+    multiplicative systematic uncertainties when obtaining the sampling matrix.
+
+    Parameters
+    ---------
+    commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec`
+        CommonData which stores information about systematic errors,
+        their treatment and description.
+
+    seed: int
+        Seed used to initialise the random number generator.
+
+    Returns
+    -------
+    pseudodata: np.array
+        Numpy array which is N_dat (where N_dat is the number of data points after cuts)
+        containing monte carlo samples of data centered around the data central value.
+
+
+    Example
+    -------
+    >>> from validphys.loader import Loader
+    >>> l = Loader()
+    >>> from validphys.pseudodata import make_replica
+    >>> cd = l.check_commondata("NMC")
+    >>> make_replica(cd)
+    - Random Generator allocated: ranlux
+    array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631,
+       0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
+       0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
+       0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
+       0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 ,
+
+    .. todo:: Replace the inhouse random number generation with a numpy equivalent.
+    .. todo:: Allow for correlations between datasets within an experiment.
+    """
+    # TODO: eventually remove the inhouse random number generator
+    # in favour of e.g numpy
+    NNPDF.RandomGenerator.InitRNG(0, seed)
+    rng = NNPDF.RandomGenerator.GetRNG()
+
+    if not isinstance(commondata, CommonData):
+        ld_cd = load_commondata(commondata)
+    else:
+        ld_cd = commondata
+
+    covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
+    sampling_matrix = sqrt_covmat(covmat_for_sampling)
+
+    central_values = ld_cd.central_values
+    nsys, ndata = ld_cd.nsys, ld_cd.ndata
+
+    # Define these variables here to prevent them
+    # from being redefined in the while loop
+    sys_names = ld_cd.systype_table["name"]
+    sys_types = ld_cd.systype_table["type"]
+    systematics = ld_cd.sys_errors
+
+    # Drop any systematics that are related to theory uncertainties
+    # or have explicitly been labelled as a systematic to be skipped.
+    # We are only interested in multiplicative uncertainties
+    mult_uncorr_sys = systematics.loc[
+        :,
+        (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
+        & (sys_types == "MULT"),
+    ]
+    mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
+
+    # We want to know the names of the remaining systematics.
+    mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
+
+    # We need to loop until the pseudodata is positive
+    while True:
+        # It is useful to have this quantity as a pandas.Series
+        rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
+        # And this as a numpy.array
+        deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
+
+        correlated_deviates = sampling_matrix @ deviates
+
+        pseudodata = np.array(central_values) + correlated_deviates
+
+        # Matrix of random numbers generated by either sampling from rand in the
+        # correct manner, or by requesting a new random number if the systematic
+        # error type was UNCORR
+        random_matrix = np.array(
+            [
+                [
+                    rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
+                    for i, sys_type in mult_uncorr_names.iteritems()
+                ]
+                for _ in range(ndata)
+            ]
+        )
+
+        # The final pseudodata is obtained by doing an elementwise multiplication
+        # of the multiplicative uncertainties with the above random matrix. We then
+        # add unity to all elements and take the product across columns. This vector
+        # must then be elementwise multiplied by central values + correlated deviates.
+        pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
+
+        # Check positivity of the pseudodata
+        if all(pseudodata > 0):
+            break
+
+    return pseudodata
+
+
 @check_darwin_single_process
 def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
     """A function to obtain information about the pseudodata that went

From ed9edaa8aaa2fc0f2cd426f6dcadeaa3a16f7113 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Thu, 27 Aug 2020 15:54:30 +0100
Subject: [PATCH 07/37] Making the function into an infinite generator

Thus it remembers the RNG state
---
 validphys2/src/validphys/pseudodata.py | 74 ++++++++++++++------------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index d0990114ab..69a65e2aef 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -110,9 +110,10 @@ def read_fit_pseudodata(fitcontext, context_index):
 
 
 def make_replica(commondata, seed=1):
-    """Function that takes in a :py:class:`validphys.coredata.CommonData` or
-    :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas
-    of the data central value.
+    """Generator that takes in a :py:class:`validphys.coredata.CommonData` or
+    :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas
+    of the data central value. This generator is infinite in the sense that it will
+    never raise a StopIteration exception
 
     The square root (obtained by the Cholesky decomposition) of the covariance
     matrix is used as the sampling matrix. The covariance matrix is obtained using
@@ -141,13 +142,14 @@ def make_replica(commondata, seed=1):
     >>> l = Loader()
     >>> from validphys.pseudodata import make_replica
     >>> cd = l.check_commondata("NMC")
-    >>> make_replica(cd)
+    >>> data_generator = make_replica(cd)
+    >>> next(data_generator)
     - Random Generator allocated: ranlux
     array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631,
-       0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
-       0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
-       0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
-       0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 ,
+           0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
+           0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
+           0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
+
 
     .. todo:: Replace the inhouse random number generation with a numpy equivalent.
     .. todo:: Allow for correlations between datasets within an experiment.
@@ -187,41 +189,45 @@ def make_replica(commondata, seed=1):
     # We want to know the names of the remaining systematics.
     mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
 
-    # We need to loop until the pseudodata is positive
+    # We make this generator infinite in the sense that it shall never
+    # raise a StopIteration exception. Thus we can continuely request
+    # pseudodata from it and it will remember the RNG state.
     while True:
-        # It is useful to have this quantity as a pandas.Series
-        rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
-        # And this as a numpy.array
-        deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
+        # We need to loop until the pseudodata is positive
+        while True:
+            # It is useful to have this quantity as a pandas.Series
+            rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
+            # And this as a numpy.array
+            deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
 
-        correlated_deviates = sampling_matrix @ deviates
+            correlated_deviates = sampling_matrix @ deviates
 
-        pseudodata = np.array(central_values) + correlated_deviates
+            pseudodata = np.array(central_values) + correlated_deviates
 
-        # Matrix of random numbers generated by either sampling from rand in the
-        # correct manner, or by requesting a new random number if the systematic
-        # error type was UNCORR
-        random_matrix = np.array(
-            [
+            # Matrix of random numbers generated by either sampling from rand in the
+            # correct manner, or by requesting a new random number if the systematic
+            # error type was UNCORR
+            random_matrix = np.array(
                 [
-                    rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
-                    for i, sys_type in mult_uncorr_names.iteritems()
+                    [
+                        rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
+                        for i, sys_type in mult_uncorr_names.iteritems()
+                    ]
+                    for _ in range(ndata)
                 ]
-                for _ in range(ndata)
-            ]
-        )
+            )
 
-        # The final pseudodata is obtained by doing an elementwise multiplication
-        # of the multiplicative uncertainties with the above random matrix. We then
-        # add unity to all elements and take the product across columns. This vector
-        # must then be elementwise multiplied by central values + correlated deviates.
-        pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
+            # The final pseudodata is obtained by doing an elementwise multiplication
+            # of the multiplicative uncertainties with the above random matrix. We then
+            # add unity to all elements and take the product across columns. This vector
+            # must then be elementwise multiplied by central values + correlated deviates.
+            pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
 
-        # Check positivity of the pseudodata
-        if all(pseudodata > 0):
-            break
+            # Check positivity of the pseudodata
+            if all(pseudodata > 0):
+                break
 
-    return pseudodata
+        yield pseudodata
 
 
 @check_darwin_single_process

From 7bc29fd74d87da3a6e197744b9cafb10b8ff5ed4 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Fri, 4 Sep 2020 10:54:01 +0100
Subject: [PATCH 08/37] Adding comment explaining why we divide the
 multiplicative uncertainties by 100

---
 validphys2/src/validphys/pseudodata.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 69a65e2aef..186747f62c 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -184,6 +184,8 @@ def make_replica(commondata, seed=1):
         (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
         & (sys_types == "MULT"),
     ]
+    # The multiplicative uncertainties are quoted as percentages, so
+    # we change them to fractional values
     mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
 
     # We want to know the names of the remaining systematics.

From 3ccda956d82352fdbb30cf2accb5ba650e27db91 Mon Sep 17 00:00:00 2001
From: siranipour <43517072+siranipour@users.noreply.github.com>
Date: Fri, 4 Sep 2020 12:29:26 +0100
Subject: [PATCH 09/37] Update validphys2/src/validphys/covmats.py

Co-authored-by: Zaharid <zk261@cam.ac.uk>
---
 validphys2/src/validphys/covmats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index efdf356433..d70a5a89c3 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -192,7 +192,7 @@ def covmat_from_systematics(commondata, central_values=None):
     use_mult_errors: bool
         Boolean which controls whether we use multiplicative errors
         when computing the covmat. By default it should be set to ``True``
-        unless one wishes to uses a sampling matrix for pseudodata generation,
+        unless one wishes to sample pseudodata from a multigaussian distribution,
         in which case it should be set to ``False``.
 
     Returns

From 37b460c075367c0c368b3030aa87e3a69398271f Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Fri, 4 Sep 2020 16:10:04 +0100
Subject: [PATCH 10/37] Changing covmat_for_sampling to additive_covmat

---
 validphys2/src/validphys/pseudodata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 186747f62c..09ce272ad2 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -164,8 +164,8 @@ def make_replica(commondata, seed=1):
     else:
         ld_cd = commondata
 
-    covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
-    sampling_matrix = sqrt_covmat(covmat_for_sampling)
+    additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False)
+    sampling_matrix = sqrt_covmat(additive_covmat)
 
     central_values = ld_cd.central_values
     nsys, ndata = ld_cd.nsys, ld_cd.ndata

From 078cd5c4f889485010cfdaa9677dd57358460812 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Wed, 16 Sep 2020 15:30:32 +0100
Subject: [PATCH 11/37] Changing variable names to better reflect their purpose

---
 validphys2/src/validphys/pseudodata.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 09ce272ad2..e111906767 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -179,17 +179,17 @@ def make_replica(commondata, seed=1):
     # Drop any systematics that are related to theory uncertainties
     # or have explicitly been labelled as a systematic to be skipped.
     # We are only interested in multiplicative uncertainties
-    mult_uncorr_sys = systematics.loc[
+    mult_sys = systematics.loc[
         :,
         (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
         & (sys_types == "MULT"),
     ]
     # The multiplicative uncertainties are quoted as percentages, so
     # we change them to fractional values
-    mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
+    mult_sys = mult_sys.applymap(lambda x: x.mult / 100)
 
     # We want to know the names of the remaining systematics.
-    mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
+    mult_names = sys_names.loc[mult_sys.columns]
 
     # We make this generator infinite in the sense that it shall never
     # raise a StopIteration exception. Thus we can continuely request
@@ -213,7 +213,7 @@ def make_replica(commondata, seed=1):
                 [
                     [
                         rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
-                        for i, sys_type in mult_uncorr_names.iteritems()
+                        for i, sys_type in mult_names.iteritems()
                     ]
                     for _ in range(ndata)
                 ]
@@ -223,7 +223,7 @@ def make_replica(commondata, seed=1):
             # of the multiplicative uncertainties with the above random matrix. We then
             # add unity to all elements and take the product across columns. This vector
             # must then be elementwise multiplied by central values + correlated deviates.
-            pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
+            pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata
 
             # Check positivity of the pseudodata
             if all(pseudodata > 0):

From 55760a077bfc7d0c4829e362582345e1d4c36650 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Wed, 16 Sep 2020 16:12:33 +0100
Subject: [PATCH 12/37] Accounting for the asymmetry edge case

---
 validphys2/src/validphys/pseudodata.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index e111906767..1414f8f3ef 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -225,8 +225,9 @@ def make_replica(commondata, seed=1):
             # must then be elementwise multiplied by central values + correlated deviates.
             pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata
 
-            # Check positivity of the pseudodata
-            if all(pseudodata > 0):
+            # Check positivity of the pseudodata. Note that if the dataset is an asymmetry
+            # then it is allowed to be negative.
+            if all(pseudodata > 0) or ("ASY" in commondata.process_type):
                 break
 
         yield pseudodata

From 8217d8d0ff3974afc850886fc4b46a93e40b2f1b Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Wed, 16 Sep 2020 16:55:48 +0100
Subject: [PATCH 13/37] Adding information on the nature of sampling in the
 docstring

---
 validphys2/src/validphys/pseudodata.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 1414f8f3ef..0c462cd083 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -120,6 +120,11 @@ def make_replica(commondata, seed=1):
     :py:func:`validphys.covmats.covmat_from_systematics` and must not use
     multiplicative systematic uncertainties when obtaining the sampling matrix.
 
+    Gaussian sampling is performed using the additive part of the covariance matrix.
+    An additional gaussian sampling for the renormalization of the data having multiplicative
+    uncertainties happens later. All the contributions to the covariance matrix coming from
+    theory errors should be accounted for in the additive part
+
     Parameters
     ---------
     commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec`

From f2d42c7e4f952c0e41ffa386010041fe8f2ca00c Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 6 Oct 2020 17:29:05 +0100
Subject: [PATCH 14/37] Correcting covmat_from_systematics example

---
 validphys2/src/validphys/covmats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index d70a5a89c3..8559d9aceb 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -205,7 +205,7 @@ def covmat_from_systematics(commondata, central_values=None):
     -------
     >>> from validphys.commondataparser import load_commondata
     >>> from validphys.loader import Loader
-    >>> from validphys.calcutils import covmat_from_systematics
+    >>> from validphys.covmats import covmat_from_systematics
     >>> l = Loader()
     >>> cd = l.check_commondata("NMC")
     >>> cd = load_commondata(cd)

From 98e53e38165054fd94ed151d07b2bea65de32703 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 13 Oct 2020 15:15:56 +0100
Subject: [PATCH 15/37] Clarifying doc strings

Adding Zahari's review comments
---
 validphys2/src/validphys/covmats.py    |  7 ++++---
 validphys2/src/validphys/pseudodata.py | 16 ++++++++--------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 8559d9aceb..76a3410c8d 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -190,10 +190,11 @@ def covmat_from_systematics(commondata, central_values=None):
         using the predictions from the central member of the t0 pdf.
 
     use_mult_errors: bool
-        Boolean which controls whether we use multiplicative errors
+        Boolean which controls whether multiplicative errors are used
         when computing the covmat. By default it should be set to ``True``
-        unless one wishes to sample pseudodata from a multigaussian distribution,
-        in which case it should be set to ``False``.
+        since setting this parameter to ``False`` will completely ignore multiplicative
+        uncertainties. This flag is present for use with pseudodata generation where the
+        multiplicative uncertainties should be ignored.
 
     Returns
     -------
diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 0c462cd083..7ed133912d 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -116,14 +116,14 @@ def make_replica(commondata, seed=1):
     never raise a StopIteration exception
 
     The square root (obtained by the Cholesky decomposition) of the covariance
-    matrix is used as the sampling matrix. The covariance matrix is obtained using
-    :py:func:`validphys.covmats.covmat_from_systematics` and must not use
-    multiplicative systematic uncertainties when obtaining the sampling matrix.
-
-    Gaussian sampling is performed using the additive part of the covariance matrix.
-    An additional gaussian sampling for the renormalization of the data having multiplicative
-    uncertainties happens later. All the contributions to the covariance matrix coming from
-    theory errors should be accounted for in the additive part
+    matrix is used as the sampling matrix with the covariance matrix being obtained using
+    :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call
+    to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to
+    ignore multiplicative uncertainties. Gaussian sampling is performed using the
+    additive part of the covariance matrix. An additional gaussian sampling for the
+    renormalization of the data having multiplicative uncertainties happens later.
+    All the contributions to the covariance matrix coming from theory errors should be
+    accounted for in the additive part.
 
     Parameters
     ---------

From 213de7f067a9737fa3cf2b4592d0713952f1cdb4 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 13 Oct 2020 15:20:40 +0100
Subject: [PATCH 16/37] Setting default seed kwarg to None

Assigning seed randomly if not provided explicilty in function call
---
 validphys2/src/validphys/pseudodata.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 7ed133912d..b804e68853 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -7,6 +7,7 @@
 import multiprocessing as mp
 import os
 import pathlib
+import random
 
 import numpy as np
 import pandas as pd
@@ -109,7 +110,7 @@ def read_fit_pseudodata(fitcontext, context_index):
         yield pseudodata.drop("type", axis=1), tr.index, val.index
 
 
-def make_replica(commondata, seed=1):
+def make_replica(commondata, seed=None):
     """Generator that takes in a :py:class:`validphys.coredata.CommonData` or
     :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas
     of the data central value. This generator is infinite in the sense that it will
@@ -131,8 +132,10 @@ def make_replica(commondata, seed=1):
         CommonData which stores information about systematic errors,
         their treatment and description.
 
-    seed: int
-        Seed used to initialise the random number generator.
+    seed: int, None
+        Seed used to initialise the random number generator. If ``None`` then a random seed is allocated
+        using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing
+        inadvertent correlations.
 
     Returns
     -------
@@ -159,6 +162,11 @@ def make_replica(commondata, seed=1):
     .. todo:: Replace the inhouse random number generation with a numpy equivalent.
     .. todo:: Allow for correlations between datasets within an experiment.
     """
+    # If seed is None (i.e not explicitly provided in the function call)
+    # then set it randomly
+    if seed is None:
+        seed = random.randint(1, 1000)
+
     # TODO: eventually remove the inhouse random number generator
     # in favour of e.g numpy
     NNPDF.RandomGenerator.InitRNG(0, seed)

From 612197f18de7990b7b3756b728d868856290fbfa Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 17 Nov 2020 11:23:53 +0000
Subject: [PATCH 17/37] Removing make replica from covmats.py

---
 validphys2/src/validphys/covmats.py | 114 ----------------------------
 1 file changed, 114 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 76a3410c8d..96c5453d8c 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -26,120 +26,6 @@
 
 INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")
 
-def make_replica(commondata, seed=1):
-    """Function that takes in a :py:class:`validphys.coredata.CommonData` or
-    :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas
-    of the data central value.
-
-    The square root (obtained by the Cholesky decomposition) of the covariance
-    matrix is used as the sampling matrix. The covariance matrix is obtained using
-    :py:func:`validphys.covmats.covmat_from_systematics` and must not use
-    multiplicative systematic uncertainties when obtaining the sampling matrix.
-
-    Parameters
-    ---------
-    commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec`
-        CommonData which stores information about systematic errors,
-        their treatment and description.
-
-    seed: int
-        Seed used to initialise the random number generator.
-
-    Returns
-    -------
-    pseudodata: np.array
-        Numpy array which is N_dat (where N_dat is the number of data points after cuts)
-        containing monte carlo samples of data centered around the data central value.
-
-
-    Example
-    -------
-    >>> from validphys.loader import Loader
-    >>> l = Loader()
-    >>> from validphys.covmats import make_replica
-    >>> cd = l.check_commondata("NMC")
-    >>> make_replica(cd)
-    - Random Generator allocated: ranlux
-    array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631,
-       0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
-       0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
-       0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
-       0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 ,
-
-    .. todo:: Replace the inhouse random number generation with a numpy equivalent.
-    .. todo:: Allow for correlations between datasets within an experiment.
-    """
-    # TODO: eventually remove the inhouse random number generator
-    # in favour of e.g numpy
-    NNPDF.RandomGenerator.InitRNG(0, seed)
-    rng = NNPDF.RandomGenerator.GetRNG()
-
-    if not isinstance(commondata, CommonData):
-        ld_cd = load_commondata(commondata)
-    else:
-        ld_cd = commondata
-
-    covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False)
-    sampling_matrix = sqrt_covmat(covmat_for_sampling)
-
-    central_values = ld_cd.central_values
-    nsys, ndata = ld_cd.nsys, ld_cd.ndata
-
-    # Define these variables here to prevent them
-    # from being redefined in the while loop
-    sys_names = ld_cd.systype_table["name"]
-    sys_types = ld_cd.systype_table["type"]
-    systematics = ld_cd.sys_errors
-
-    # Drop any systematics that are related to theory uncertainties
-    # or have explicitly been labelled as a systematic to be skipped.
-    # We are only interested in multiplicative uncertainties
-    mult_uncorr_sys = systematics.loc[
-        :,
-        (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
-        & (sys_types == "MULT"),
-    ]
-    mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100)
-
-    # We want to know the names of the remaining systematics.
-    mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns]
-
-    # We need to loop until the pseudodata is positive
-    while True:
-        # It is useful to have this quantity as a pandas.Series
-        rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
-        # And this as a numpy.array
-        deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
-
-        correlated_deviates = sampling_matrix @ deviates
-
-        pseudodata = np.array(central_values) + correlated_deviates
-
-        # Matrix of random numbers generated by either sampling from rand in the
-        # correct manner, or by requesting a new random number if the systematic
-        # error type was UNCORR
-        random_matrix = np.array(
-            [
-                [
-                    rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
-                    for i, sys_type in mult_uncorr_names.iteritems()
-                ]
-                for _ in range(ndata)
-            ]
-        )
-
-        # The final pseudodata is obtained by doing an elementwise multiplication
-        # of the multiplicative uncertainties with the above random matrix. We then
-        # add unity to all elements and take the product across columns. This vector
-        # must then be elementwise multiplied by central values + correlated deviates.
-        pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata
-
-        # Check positivity of the pseudodata
-        if all(pseudodata > 0):
-            break
-
-    return pseudodata
-
 
 def covmat_from_systematics(commondata, central_values=None):
     """Take the statistical uncertainty and systematics table from

From 3f1ce74a6cad692f4246a9bbd75f68386680df01 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 17 Nov 2020 13:35:46 +0000
Subject: [PATCH 18/37] Adding use_mult_errors flag to
 datasets_covmat_from_systematics

---
 validphys2/src/validphys/covmats.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 96c5453d8c..263fabd17d 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -75,13 +75,6 @@ def covmat_from_systematics(commondata, central_values=None):
         can be used to calculate, for example, the t0 covariance matrix by
         using the predictions from the central member of the t0 pdf.
 
-    use_mult_errors: bool
-        Boolean which controls whether multiplicative errors are used
-        when computing the covmat. By default it should be set to ``True``
-        since setting this parameter to ``False`` will completely ignore multiplicative
-        uncertainties. This flag is present for use with pseudodata generation where the
-        multiplicative uncertainties should be ignored.
-
     Returns
     -------
     cov_mat: np.array
@@ -139,6 +132,10 @@ def datasets_covmat_from_systematics(
         contribution. By default this is None and the experimental central
         values are used.
 
+    use_mult_errors: bool
+        Whether or not to include multiplicative systematic errors in the computation of
+        the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`.
+
     Returns
     -------
     cov_mat : np.array

From f0fa6282add7e68dadba2bda40f9f34eb60a1d1b Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 7 Dec 2020 15:17:21 +0000
Subject: [PATCH 19/37] Adding functionality to compute pseudodata for a list
 of commondata Co-authored-by: wilsonmr
 <33907451+wilsonmr@users.noreply.github.com>

---
 validphys2/src/validphys/pseudodata.py | 140 ++++++++++++-------------
 1 file changed, 69 insertions(+), 71 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index b804e68853..2e7d5cbee4 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -15,7 +15,7 @@
 from validphys.checks import check_cuts_fromfit, check_darwin_single_process
 from validphys.commondataparser import load_commondata
 from validphys.coredata import CommonData
-from validphys.covmats import covmat_from_systematics, sqrt_covmat
+from validphys.covmats import INTRA_DATASET_SYS_NAME
 
 from reportengine import collect
 
@@ -110,7 +110,8 @@ def read_fit_pseudodata(fitcontext, context_index):
         yield pseudodata.drop("type", axis=1), tr.index, val.index
 
 
-def make_replica(commondata, seed=None):
+def make_replica(list_of_commondata, seed=None):
+    # TODO: this is the old docstring: update it
     """Generator that takes in a :py:class:`validphys.coredata.CommonData` or
     :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas
     of the data central value. This generator is infinite in the sense that it will
@@ -165,85 +166,82 @@ def make_replica(commondata, seed=None):
     # If seed is None (i.e not explicitly provided in the function call)
     # then set it randomly
     if seed is None:
-        seed = random.randint(1, 1000)
+        seed = np.random.randint(1000)
 
-    # TODO: eventually remove the inhouse random number generator
-    # in favour of e.g numpy
-    NNPDF.RandomGenerator.InitRNG(0, seed)
-    rng = NNPDF.RandomGenerator.GetRNG()
-
-    if not isinstance(commondata, CommonData):
-        ld_cd = load_commondata(commondata)
-    else:
-        ld_cd = commondata
-
-    additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False)
-    sampling_matrix = sqrt_covmat(additive_covmat)
-
-    central_values = ld_cd.central_values
-    nsys, ndata = ld_cd.nsys, ld_cd.ndata
-
-    # Define these variables here to prevent them
-    # from being redefined in the while loop
-    sys_names = ld_cd.systype_table["name"]
-    sys_types = ld_cd.systype_table["type"]
-    systematics = ld_cd.sys_errors
-
-    # Drop any systematics that are related to theory uncertainties
-    # or have explicitly been labelled as a systematic to be skipped.
-    # We are only interested in multiplicative uncertainties
-    mult_sys = systematics.loc[
-        :,
-        (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"]))
-        & (sys_types == "MULT"),
-    ]
-    # The multiplicative uncertainties are quoted as percentages, so
-    # we change them to fractional values
-    mult_sys = mult_sys.applymap(lambda x: x.mult / 100)
-
-    # We want to know the names of the remaining systematics.
-    mult_names = sys_names.loc[mult_sys.columns]
+    # Seed the numpy RNG with the seed.
+    np.random.seed(seed=seed)
 
     # We make this generator infinite in the sense that it shall never
-    # raise a StopIteration exception. Thus we can continuely request
+    # raise a StopIteration exception. Thus we can continually request
     # pseudodata from it and it will remember the RNG state.
     while True:
-        # We need to loop until the pseudodata is positive
+        # The inner while True loop is for ensuring a positive definite
+        # pseudodata replica
         while True:
-            # It is useful to have this quantity as a pandas.Series
-            rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index)
-            # And this as a numpy.array
-            deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)])
-
-            correlated_deviates = sampling_matrix @ deviates
-
-            pseudodata = np.array(central_values) + correlated_deviates
-
-            # Matrix of random numbers generated by either sampling from rand in the
-            # correct manner, or by requesting a new random number if the systematic
-            # error type was UNCORR
-            random_matrix = np.array(
-                [
-                    [
-                        rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i]
-                        for i, sys_type in mult_names.iteritems()
-                    ]
-                    for _ in range(ndata)
-                ]
-            )
+            pseudodatas = []
+            special_add = []
+            special_mult = []
+            mult_shifts = []
+            check_positive_masks = []
+            for cd in list_of_commondata:
+                pseudodata = cd.central_values.to_numpy()
+
+                add_errors = cd.additive_errors
+                add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()
+
+                pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1)
+
+                # correlated within dataset
+                add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
+                pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1])
+
+                # append the partially shifted pseudodata
+                pseudodatas.append(pseudodata)
+                special_add.append(
+                    add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
+                )
+                mult_errors = cd.multiplicative_errors
+                mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
+                # convert to from percent to fraction
+                mult_shift = (
+                    1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100
+                ).prod(axis=1)
+
+                mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
+                mult_shift *= (
+                    1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100
+                ).prod(axis=1)
+
+                mult_shifts.append(mult_shift)
+
+                special_mult.append(
+                    mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
+                )
+
+                # mask out the data we want to check are all positive
+                if "ASY" in cd.commondataproc:
+                    check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
+                else:
+                    check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
+
+            # if we sort here (which sorts columns), then permuting datasets doesn't change the result
+            special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy()
+            # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
+            special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
 
-            # The final pseudodata is obtained by doing an elementwise multiplication
-            # of the multiplicative uncertainties with the above random matrix. We then
-            # add unity to all elements and take the product across columns. This vector
-            # must then be elementwise multiplied by central values + correlated deviates.
-            pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata
 
-            # Check positivity of the pseudodata. Note that if the dataset is an asymmetry
-            # then it is allowed to be negative.
-            if all(pseudodata > 0) or ("ASY" in commondata.process_type):
+            all_pseudodata = (
+                np.concatenate(pseudodatas, axis=0)
+                + special_add_errors @ np.random.randn(special_add_errors.shape[1])
+            ) * (
+                np.concatenate(mult_shifts, axis=0)
+                * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1)
+            )
+
+            if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
                 break
 
-        yield pseudodata
+        yield all_pseudodata
 
 
 @check_darwin_single_process

From c7626b80b9a19b0a57996f3bb57e0485701d54ce Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Tue, 8 Dec 2020 13:09:15 +0000
Subject: [PATCH 20/37] Adding fillna(0) to special additive errors

---
 validphys2/src/validphys/pseudodata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 2e7d5cbee4..da9311e00b 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -225,8 +225,8 @@ def make_replica(list_of_commondata, seed=None):
                     check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
 
             # if we sort here (which sorts columns), then permuting datasets doesn't change the result
-            special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy()
             # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
+            special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
             special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
 
 

From c3fd73f9f2b2d337e71dd7fefe520aad3fed5888 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Wed, 9 Dec 2020 15:58:08 +0000
Subject: [PATCH 21/37] Changing covmats.py to not use the use_mult_errors flag

---
 validphys2/src/validphys/covmats.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 263fabd17d..189d61d89b 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -132,10 +132,6 @@ def datasets_covmat_from_systematics(
         contribution. By default this is None and the experimental central
         values are used.
 
-    use_mult_errors: bool
-        Whether or not to include multiplicative systematic errors in the computation of
-        the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`.
-
     Returns
     -------
     cov_mat : np.array

From a5213cf605b27e3de60b832c304345f84171ba55 Mon Sep 17 00:00:00 2001
From: wilsonmr <33907451+wilsonmr@users.noreply.github.com>
Date: Wed, 16 Dec 2020 10:01:31 +0000
Subject: [PATCH 22/37] Apply suggestions from code review - comments to split
 up code

Co-authored-by: Rosalyn Pearson <33020850+RosalynLP@users.noreply.github.com>
---
 validphys2/src/validphys/pseudodata.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index da9311e00b..eb450f0fc9 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -186,6 +186,7 @@ def make_replica(list_of_commondata, seed=None):
             for cd in list_of_commondata:
                 pseudodata = cd.central_values.to_numpy()
 
+                # ~~~ ADDITIVE ERRORS  ~~~
                 add_errors = cd.additive_errors
                 add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()
 
@@ -197,9 +198,11 @@ def make_replica(list_of_commondata, seed=None):
 
                 # append the partially shifted pseudodata
                 pseudodatas.append(pseudodata)
+                # store the additive errors with correlations between datasets for later use
                 special_add.append(
                     add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
                 )
+                # ~~~ MULTIPLICATIVE ERRORS ~~~
                 mult_errors = cd.multiplicative_errors
                 mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
                 # convert to from percent to fraction
@@ -214,6 +217,7 @@ def make_replica(list_of_commondata, seed=None):
 
                 mult_shifts.append(mult_shift)
 
+                # store the multiplicative errors with correlations between datasets for later use
                 special_mult.append(
                     mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
                 )

From 7ee813f204f1ae792ef4fd22817684b2f3f5a7fa Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 16 Dec 2020 10:17:23 +0000
Subject: [PATCH 23/37] include stat errors

---
 validphys2/src/validphys/pseudodata.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index eb450f0fc9..c74529b48f 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -186,6 +186,9 @@ def make_replica(list_of_commondata, seed=None):
             for cd in list_of_commondata:
                 pseudodata = cd.central_values.to_numpy()
 
+                # add contribution from statistical uncertainty
+                pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata))
+
                 # ~~~ ADDITIVE ERRORS  ~~~
                 add_errors = cd.additive_errors
                 add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()

From 46020df63205a844ae9309def12a89af639d415d Mon Sep 17 00:00:00 2001
From: siranipour <43517072+siranipour@users.noreply.github.com>
Date: Fri, 15 Jan 2021 14:06:45 +0000
Subject: [PATCH 24/37] Removing if seed is None

Numpy has this as default behaviour

Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com>
---
 validphys2/src/validphys/pseudodata.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index c74529b48f..0a1b70e064 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -163,11 +163,6 @@ def make_replica(list_of_commondata, seed=None):
     .. todo:: Replace the inhouse random number generation with a numpy equivalent.
     .. todo:: Allow for correlations between datasets within an experiment.
     """
-    # If seed is None (i.e not explicitly provided in the function call)
-    # then set it randomly
-    if seed is None:
-        seed = np.random.randint(1000)
-
     # Seed the numpy RNG with the seed.
     np.random.seed(seed=seed)
 

From 3d776a5d618cf014be8ba69c637917cf6dedc1d0 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Fri, 15 Jan 2021 14:18:24 +0000
Subject: [PATCH 25/37] Changing generator to function

---
 validphys2/src/validphys/pseudodata.py | 138 ++++++++++++-------------
 1 file changed, 67 insertions(+), 71 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 0a1b70e064..b65024a5b7 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -166,84 +166,80 @@ def make_replica(list_of_commondata, seed=None):
     # Seed the numpy RNG with the seed.
     np.random.seed(seed=seed)
 
-    # We make this generator infinite in the sense that it shall never
-    # raise a StopIteration exception. Thus we can continually request
-    # pseudodata from it and it will remember the RNG state.
+    # The inner while True loop is for ensuring a positive definite
+    # pseudodata replica
     while True:
-        # The inner while True loop is for ensuring a positive definite
-        # pseudodata replica
-        while True:
-            pseudodatas = []
-            special_add = []
-            special_mult = []
-            mult_shifts = []
-            check_positive_masks = []
-            for cd in list_of_commondata:
-                pseudodata = cd.central_values.to_numpy()
-
-                # add contribution from statistical uncertainty
-                pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata))
-
-                # ~~~ ADDITIVE ERRORS  ~~~
-                add_errors = cd.additive_errors
-                add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()
-
-                pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1)
-
-                # correlated within dataset
-                add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
-                pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1])
-
-                # append the partially shifted pseudodata
-                pseudodatas.append(pseudodata)
-                # store the additive errors with correlations between datasets for later use
-                special_add.append(
-                    add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
-                )
-                # ~~~ MULTIPLICATIVE ERRORS ~~~
-                mult_errors = cd.multiplicative_errors
-                mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
-                # convert to from percent to fraction
-                mult_shift = (
-                    1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100
-                ).prod(axis=1)
-
-                mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
-                mult_shift *= (
-                    1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100
-                ).prod(axis=1)
-
-                mult_shifts.append(mult_shift)
-
-                # store the multiplicative errors with correlations between datasets for later use
-                special_mult.append(
-                    mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
-                )
+        pseudodatas = []
+        special_add = []
+        special_mult = []
+        mult_shifts = []
+        check_positive_masks = []
+        for cd in list_of_commondata:
+            pseudodata = cd.central_values.to_numpy()
+
+            # add contribution from statistical uncertainty
+            pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata))
+
+            # ~~~ ADDITIVE ERRORS  ~~~
+            add_errors = cd.additive_errors
+            add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()
+
+            pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1)
+
+            # correlated within dataset
+            add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
+            pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1])
+
+            # append the partially shifted pseudodata
+            pseudodatas.append(pseudodata)
+            # store the additive errors with correlations between datasets for later use
+            special_add.append(
+                add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
+            )
+            # ~~~ MULTIPLICATIVE ERRORS ~~~
+            mult_errors = cd.multiplicative_errors
+            mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
+            # convert to from percent to fraction
+            mult_shift = (
+                1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100
+            ).prod(axis=1)
+
+            mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
+            mult_shift *= (
+                1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100
+            ).prod(axis=1)
+
+            mult_shifts.append(mult_shift)
+
+            # store the multiplicative errors with correlations between datasets for later use
+            special_mult.append(
+                mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)]
+            )
 
-                # mask out the data we want to check are all positive
-                if "ASY" in cd.commondataproc:
-                    check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
-                else:
-                    check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
+            # mask out the data we want to check are all positive
+            if "ASY" in cd.commondataproc:
+                check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool))
+            else:
+                check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
 
-            # if we sort here (which sorts columns), then permuting datasets doesn't change the result
-            # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
-            special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
-            special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
+        # if we sort here (which sorts columns), then permuting datasets doesn't change the result
+        # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
+        special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
+        special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()
 
 
-            all_pseudodata = (
-                np.concatenate(pseudodatas, axis=0)
-                + special_add_errors @ np.random.randn(special_add_errors.shape[1])
-            ) * (
-                np.concatenate(mult_shifts, axis=0)
-                * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1)
-            )
+        all_pseudodata = (
+            np.concatenate(pseudodatas, axis=0)
+            + special_add_errors @ np.random.randn(special_add_errors.shape[1])
+        ) * (
+            np.concatenate(mult_shifts, axis=0)
+            * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1)
+        )
 
-            if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
-                break
+        if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):
+            break
 
-        yield all_pseudodata
+    return all_pseudodata
 
 
 @check_darwin_single_process

From 1984534edf01378c7e29ea082b70b956f1e0d3b8 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Fri, 15 Jan 2021 14:27:56 +0000
Subject: [PATCH 26/37] Updating doc string

---
 validphys2/src/validphys/pseudodata.py | 58 ++++++++++----------------
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index b65024a5b7..dcd47203b3 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -111,57 +111,43 @@ def read_fit_pseudodata(fitcontext, context_index):
 
 
 def make_replica(list_of_commondata, seed=None):
-    # TODO: this is the old docstring: update it
-    """Generator that takes in a :py:class:`validphys.coredata.CommonData` or
-    :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas
-    of the data central value. This generator is infinite in the sense that it will
-    never raise a StopIteration exception
-
-    The square root (obtained by the Cholesky decomposition) of the covariance
-    matrix is used as the sampling matrix with the covariance matrix being obtained using
-    :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call
-    to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to
-    ignore multiplicative uncertainties. Gaussian sampling is performed using the
-    additive part of the covariance matrix. An additional gaussian sampling for the
-    renormalization of the data having multiplicative uncertainties happens later.
-    All the contributions to the covariance matrix coming from theory errors should be
-    accounted for in the additive part.
+    """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
+    objects and returns pseudodata replicas of the data central value accounting for
+    possible correlations between systematic uncertainties.
+
+    The function loops until positive definite pseudodata is generated for any
+    non-asymmetry datasets. In the case of an asymmetry dataset negative values are
+    permitted so the loop block executes only once.
 
     Parameters
     ---------
-    commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec`
-        CommonData which stores information about systematic errors,
-        their treatment and description.
+    list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`]
+        Iterable of CommonData objects which stores information about systematic errors,
+        their treatment and description, for each dataset.
 
     seed: int, None
-        Seed used to initialise the random number generator. If ``None`` then a random seed is allocated
-        using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing
-        inadvertent correlations.
+        Seed used to initialise the numpy random number generator. If ``None`` then a random seed is
+        allocated using the default numpy behaviour.
 
     Returns
     -------
     pseudodata: np.array
-        Numpy array which is N_dat (where N_dat is the number of data points after cuts)
+        Numpy array which is N_dat (where N_dat is the combined number of data points after cuts)
         containing monte carlo samples of data centered around the data central value.
 
-
     Example
     -------
+    >>> from validphys.commondataparser import load_commondata
     >>> from validphys.loader import Loader
-    >>> l = Loader()
     >>> from validphys.pseudodata import make_replica
-    >>> cd = l.check_commondata("NMC")
-    >>> data_generator = make_replica(cd)
-    >>> next(data_generator)
-    - Random Generator allocated: ranlux
-    array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631,
-           0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287,
-           0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687,
-           0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424,
-
-
-    .. todo:: Replace the inhouse random number generation with a numpy equivalent.
-    .. todo:: Allow for correlations between datasets within an experiment.
+    >>> l = Loader()
+    >>> cd1 = l.check_commondata("NMC")
+    >>> cd2 = l.check_commondata("NMCPD")
+    >>> lds = map(load_commondata, (cd1, cd2))
+    >>> make_replica(lds)
+    array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
+        0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
+        0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
     """
     # Seed the numpy RNG with the seed.
     np.random.seed(seed=seed)

From b5df64e4f56d53e5811b87819e34df518d67046b Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Fri, 15 Jan 2021 16:16:34 +0000
Subject: [PATCH 27/37] Using new numpy.random interface

---
 validphys2/src/validphys/pseudodata.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index dcd47203b3..e5ec8a30f6 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -7,7 +7,6 @@
 import multiprocessing as mp
 import os
 import pathlib
-import random
 
 import numpy as np
 import pandas as pd
@@ -150,7 +149,7 @@ def make_replica(list_of_commondata, seed=None):
         0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
     """
     # Seed the numpy RNG with the seed.
-    np.random.seed(seed=seed)
+    rng = np.random.default_rng(seed=seed)
 
     # The inner while True loop is for ensuring a positive definite
     # pseudodata replica
@@ -164,17 +163,17 @@ def make_replica(list_of_commondata, seed=None):
             pseudodata = cd.central_values.to_numpy()
 
             # add contribution from statistical uncertainty
-            pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata))
+            pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata))
 
             # ~~~ ADDITIVE ERRORS  ~~~
             add_errors = cd.additive_errors
             add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy()
 
-            pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1)
+            pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1)
 
             # correlated within dataset
             add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy()
-            pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1])
+            pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1])
 
             # append the partially shifted pseudodata
             pseudodatas.append(pseudodata)
@@ -187,12 +186,12 @@ def make_replica(list_of_commondata, seed=None):
             mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy()
             # convert to from percent to fraction
             mult_shift = (
-                1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100
+                1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100
             ).prod(axis=1)
 
             mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy()
             mult_shift *= (
-                1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100
+                1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100
             ).prod(axis=1)
 
             mult_shifts.append(mult_shift)
@@ -216,10 +215,10 @@ def make_replica(list_of_commondata, seed=None):
 
         all_pseudodata = (
             np.concatenate(pseudodatas, axis=0)
-            + special_add_errors @ np.random.randn(special_add_errors.shape[1])
+            + special_add_errors @ rng.normal(size=special_add_errors.shape[1])
         ) * (
             np.concatenate(mult_shifts, axis=0)
-            * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1)
+            * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1)
         )
 
         if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0):

From ed674e84297f1cf5fbd5d2d1a3a51e3b40c4d063 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 27 Jan 2021 18:09:40 +0000
Subject: [PATCH 28/37] add copy to central values before generating data

---
 validphys2/src/validphys/pseudodata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index e5ec8a30f6..28c150ecc3 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -160,7 +160,8 @@ def make_replica(list_of_commondata, seed=None):
         mult_shifts = []
         check_positive_masks = []
         for cd in list_of_commondata:
-            pseudodata = cd.central_values.to_numpy()
+            # copy here to avoid mutating the central values.
+            pseudodata = cd.central_values.to_numpy(copy=True)
 
             # add contribution from statistical uncertainty
             pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata))

From f4810349e48dda1f673cbf2cd6a2c053265d49e8 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Thu, 28 Jan 2021 12:21:25 +0000
Subject: [PATCH 29/37] added some tests to catch that central values are
 unchanged and that generated data is correct length.

---
 .../validphys/tests/test_pythonmakereplica.py | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 validphys2/src/validphys/tests/test_pythonmakereplica.py

diff --git a/validphys2/src/validphys/tests/test_pythonmakereplica.py b/validphys2/src/validphys/tests/test_pythonmakereplica.py
new file mode 100644
index 0000000000..639d0a371b
--- /dev/null
+++ b/validphys2/src/validphys/tests/test_pythonmakereplica.py
@@ -0,0 +1,78 @@
+"""
+test_pythonmakereplica.py
+
+Module for testing the python implementation of make replica
+
+"""
+import numpy as np
+import pytest
+
+from validphys.api import API
+from validphys.pseudodata import make_replica
+from validphys.tests.conftest import DATA
+from validphys.tests.test_covmats import CORR_DATA
+
+
+SINGLE_SYS_DATASETS = [
+    {"dataset": "DYE886R"},
+    {"dataset": "D0ZRAP", "cfac": ["QCD"]},
+    {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "cfac": ["QCD"]},
+    {"dataset": "CMS_SINGLETOP_TCH_R_8TEV", "cfac": ["QCD"]},
+    {"dataset": "CMS_SINGLETOP_TCH_R_13TEV", "cfac": ["QCD"]},
+]
+
+@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"])
+@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS])
+def test_commondata_unchanged(data_config, dataset_inputs, use_cuts):
+    """Load the commondata, then generate some pseudodata using make replica
+    Check that the following attributes of the commondata have not been
+    modified: central_values, commondata_table, systematics_table.
+
+    """
+    config = dict(data_config)
+    config["dataset_inputs"] = dataset_inputs
+    config["use_cuts"] = use_cuts
+    ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config)
+    # keep a copy of all central values and all systematics
+    original_cvs = [np.array(cd.central_values.to_numpy(copy=True)) for cd in ld_cds]
+    original_sys = [np.array(cd.systematics_table.to_numpy(copy=True)) for cd in ld_cds]
+    make_replica(ld_cds)
+    for post_mkrep_cd, pre_mkrep_cv, pre_mkrep_sys in zip(ld_cds, original_cvs, original_sys):
+        np.testing.assert_allclose(
+            post_mkrep_cd.central_values.to_numpy(), pre_mkrep_cv
+        )
+        np.testing.assert_allclose(
+            post_mkrep_cd.systematics_table.to_numpy(), pre_mkrep_sys
+        )
+
+
+@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"])
+@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS])
+def test_pseudodata_seeding(data_config, dataset_inputs, use_cuts):
+    """Check that using a seed reproduces the pseudodata. Note that this also
+    will check that the commondata hasn't been modified since reproducing
+    the same commondata requires that the commondata is unchanged and that
+    the random numbers are generated and used identically.
+
+    """
+    seed = 123456
+    config = dict(data_config)
+    config["dataset_inputs"] = dataset_inputs
+    config["use_cuts"] = use_cuts
+    ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config)
+    rep_1 = make_replica(ld_cds, seed=seed)
+    rep_2 = make_replica(ld_cds, seed=seed)
+    np.testing.assert_allclose(rep_1, rep_2)
+
+
+@pytest.mark.parametrize("use_cuts", ["nocuts", "internal"])
+@pytest.mark.parametrize("dataset_inputs", [DATA, CORR_DATA, SINGLE_SYS_DATASETS])
+def test_pseudodata_has_correct_ndata(data_config, dataset_inputs, use_cuts):
+    """Check that we get the correct ndata when generating pseudodata"""
+    config = dict(data_config)
+    config["dataset_inputs"] = dataset_inputs
+    config["use_cuts"] = use_cuts
+    ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config)
+    rep = make_replica(ld_cds)
+    ndata = np.sum([cd.ndata for cd in ld_cds])
+    assert len(rep) == ndata

From 133c07cb811647217f66fa2c0a963eb485ce0873 Mon Sep 17 00:00:00 2001
From: wilsonm <michael.wilson@ed.ac.uk>
Date: Wed, 3 Feb 2021 10:44:37 +0000
Subject: [PATCH 30/37] update make replica test that commondata is unchanged
 to do what it says it does

---
 .../validphys/tests/test_pythonmakereplica.py | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/validphys2/src/validphys/tests/test_pythonmakereplica.py b/validphys2/src/validphys/tests/test_pythonmakereplica.py
index 639d0a371b..627de972de 100644
--- a/validphys2/src/validphys/tests/test_pythonmakereplica.py
+++ b/validphys2/src/validphys/tests/test_pythonmakereplica.py
@@ -4,7 +4,10 @@
 Module for testing the python implementation of make replica
 
 """
+from copy import deepcopy
+
 import numpy as np
+from pandas.testing import assert_frame_equal, assert_series_equal
 import pytest
 
 from validphys.api import API
@@ -33,17 +36,22 @@ def test_commondata_unchanged(data_config, dataset_inputs, use_cuts):
     config["dataset_inputs"] = dataset_inputs
     config["use_cuts"] = use_cuts
     ld_cds = API.dataset_inputs_loaded_cd_with_cuts(**config)
-    # keep a copy of all central values and all systematics
-    original_cvs = [np.array(cd.central_values.to_numpy(copy=True)) for cd in ld_cds]
-    original_sys = [np.array(cd.systematics_table.to_numpy(copy=True)) for cd in ld_cds]
+
+    # keep a copy of all dataframes/series pre make replica
+    pre_mkrep_cvs = [deepcopy(cd.central_values) for cd in ld_cds]
+    pre_mkrep_sys_tabs = [deepcopy(cd.systematics_table) for cd in ld_cds]
+    pre_mkrep_cd_tabs = [deepcopy(cd.commondata_table) for cd in ld_cds]
+
     make_replica(ld_cds)
-    for post_mkrep_cd, pre_mkrep_cv, pre_mkrep_sys in zip(ld_cds, original_cvs, original_sys):
-        np.testing.assert_allclose(
-            post_mkrep_cd.central_values.to_numpy(), pre_mkrep_cv
-        )
-        np.testing.assert_allclose(
-            post_mkrep_cd.systematics_table.to_numpy(), pre_mkrep_sys
-        )
+
+    for post_mkrep_cd, pre_mkrep_cv in zip(ld_cds, pre_mkrep_cvs):
+        assert_series_equal(post_mkrep_cd.central_values, pre_mkrep_cv)
+
+    for post_mkrep_cd, pre_mkrep_sys_tab in zip(ld_cds, pre_mkrep_sys_tabs):
+        assert_frame_equal(post_mkrep_cd.systematics_table, pre_mkrep_sys_tab)
+
+    for post_mkrep_cd, pre_mkrep_cd_tab in zip(ld_cds, pre_mkrep_cd_tabs):
+        assert_frame_equal(post_mkrep_cd.commondata_table, pre_mkrep_cd_tab)
 
 
 @pytest.mark.parametrize("use_cuts", ["nocuts", "internal"])

From 8378139cfff032992b3de495636fa1fe3218f708 Mon Sep 17 00:00:00 2001
From: siranipour <43517072+siranipour@users.noreply.github.com>
Date: Mon, 8 Feb 2021 16:38:56 +0000
Subject: [PATCH 31/37] Apply suggestions from code review

Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com>
---
 validphys2/src/validphys/pseudodata.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 28c150ecc3..f6ee7d8472 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -111,7 +111,7 @@ def read_fit_pseudodata(fitcontext, context_index):
 
 def make_replica(list_of_commondata, seed=None):
     """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
-    objects and returns pseudodata replicas of the data central value accounting for
+    objects and returns a pseudodata replica accounting for
     possible correlations between systematic uncertainties.
 
     The function loops until positive definite pseudodata is generated for any
@@ -208,7 +208,6 @@ def make_replica(list_of_commondata, seed=None):
             else:
                 check_positive_masks.append(np.ones_like(pseudodata, dtype=bool))
 
-        # if we sort here (which sorts columns), then permuting datasets doesn't change the result
         # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
         special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy()
         special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy()

From 50f33aa6de88634e8be19cd5196627621e15e8ee Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 16:40:47 +0000
Subject: [PATCH 32/37] Fixing rebase problems

---
 validphys2/src/validphys/covmats.py    | 1 -
 validphys2/src/validphys/pseudodata.py | 4 ----
 2 files changed, 5 deletions(-)

diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 189d61d89b..e913f36105 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -19,7 +19,6 @@
     check_speclabels_different,
     check_data_cuts_match_theorycovmat,
 )
-from validphys.core import PDF, DataGroupSpec, DataSetSpec
 from validphys.results import ThPredictionsResult
 
 log = logging.getLogger(__name__)
diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index f6ee7d8472..054a996eb8 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -12,8 +12,6 @@
 import pandas as pd
 
 from validphys.checks import check_cuts_fromfit, check_darwin_single_process
-from validphys.commondataparser import load_commondata
-from validphys.coredata import CommonData
 from validphys.covmats import INTRA_DATASET_SYS_NAME
 
 from reportengine import collect
@@ -21,8 +19,6 @@
 import n3fit.io.reader as reader
 from n3fit.performfit import initialize_seeds
 
-import NNPDF
-
 log = logging.getLogger(__name__)
 
 

From 432063d26d071dc32dce7a63f8db02c463fe83a2 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 16:41:33 +0000
Subject: [PATCH 33/37] iterable -> list in docstring

---
 validphys2/src/validphys/pseudodata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 054a996eb8..e083c1f09f 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -116,8 +116,8 @@ def make_replica(list_of_commondata, seed=None):
 
     Parameters
     ---------
-    list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`]
-        Iterable of CommonData objects which stores information about systematic errors,
+    list_of_commondata: list[:py:class:`validphys.coredata.CommonData`]
+        List of CommonData objects which stores information about systematic errors,
         their treatment and description, for each dataset.
 
     seed: int, None

From e00e8d00d58b5e2be3630fa791eee3b4ca9a0251 Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 16:49:32 +0000
Subject: [PATCH 34/37] using API to load cd

---
 validphys2/src/validphys/pseudodata.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index e083c1f09f..80b6770972 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -132,13 +132,13 @@ def make_replica(list_of_commondata, seed=None):
 
     Example
     -------
-    >>> from validphys.commondataparser import load_commondata
-    >>> from validphys.loader import Loader
+    >>> from validphys.api import API
     >>> from validphys.pseudodata import make_replica
-    >>> l = Loader()
-    >>> cd1 = l.check_commondata("NMC")
-    >>> cd2 = l.check_commondata("NMCPD")
-    >>> lds = map(load_commondata, (cd1, cd2))
+    >>> lds = API.dataset_inputs_loaded_cd_with_cuts(
+                                    dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}],
+                                    use_cuts="nocuts",
+                                    theoryid=53
+                                )
     >>> make_replica(lds)
     array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
         0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,

From ae2623e0ab42373c78a68a93769db7ac18cb15ea Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 17:47:13 +0000
Subject: [PATCH 35/37] making make replica a provider

---
 validphys2/src/validphys/pseudodata.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index 80b6770972..b1ec0c3933 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -105,7 +105,7 @@ def read_fit_pseudodata(fitcontext, context_index):
         yield pseudodata.drop("type", axis=1), tr.index, val.index
 
 
-def make_replica(list_of_commondata, seed=None):
+def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None):
     """Function that takes in a list of :py:class:`validphys.coredata.CommonData`
     objects and returns a pseudodata replica accounting for
     possible correlations between systematic uncertainties.
@@ -116,7 +116,7 @@ def make_replica(list_of_commondata, seed=None):
 
     Parameters
     ---------
-    list_of_commondata: list[:py:class:`validphys.coredata.CommonData`]
+    dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`]
         List of CommonData objects which stores information about systematic errors,
         their treatment and description, for each dataset.
 
@@ -133,13 +133,11 @@ def make_replica(list_of_commondata, seed=None):
     Example
     -------
     >>> from validphys.api import API
-    >>> from validphys.pseudodata import make_replica
-    >>> lds = API.dataset_inputs_loaded_cd_with_cuts(
+    >>> pseudodata = API.make_replica(
                                     dataset_inputs=[{"dataset":"NMC"}, {"dataset": "NMCPD"}],
                                     use_cuts="nocuts",
                                     theoryid=53
                                 )
-    >>> make_replica(lds)
     array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 ,
         0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155,
         0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 ,
@@ -155,7 +153,7 @@ def make_replica(list_of_commondata, seed=None):
         special_mult = []
         mult_shifts = []
         check_positive_masks = []
-        for cd in list_of_commondata:
+        for cd in dataset_inputs_loaded_cd_with_cuts:
             # copy here to avoid mutating the central values.
             pseudodata = cd.central_values.to_numpy(copy=True)
 

From bbbd22697fb0843e0f848fe513083ed7423b116a Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 17:49:50 +0000
Subject: [PATCH 36/37] index pseudodata

---
 validphys2/src/validphys/pseudodata.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py
index b1ec0c3933..dba391e6ad 100644
--- a/validphys2/src/validphys/pseudodata.py
+++ b/validphys2/src/validphys/pseudodata.py
@@ -221,6 +221,13 @@ def make_replica(dataset_inputs_loaded_cd_with_cuts, seed=None):
     return all_pseudodata
 
 
+def indexed_make_replica(groups_index, make_replica):
+    """Index the make_replica pseudodata appropriately
+    """
+
+    return pd.DataFrame(make_replica, index=groups_index, columns=["data"])
+
+
 @check_darwin_single_process
 def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None):
     """A function to obtain information about the pseudodata that went

From 6a12bd628d113f6e5aa7821d47c4db4814e72c3f Mon Sep 17 00:00:00 2001
From: siranipour <si292@cam.ac.uk>
Date: Mon, 8 Feb 2021 18:20:20 +0000
Subject: [PATCH 37/37] Adding regression test for make_replica

---
 validphys2/src/validphys/tableloader.py       |   5 +
 .../tests/regressions/test_mcreplica.csv      | 267 ++++++++++++++++++
 .../src/validphys/tests/test_regressions.py   |  15 +-
 3 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 validphys2/src/validphys/tests/regressions/test_mcreplica.csv

diff --git a/validphys2/src/validphys/tableloader.py b/validphys2/src/validphys/tableloader.py
index 34883ad293..8d4a493fbb 100644
--- a/validphys2/src/validphys/tableloader.py
+++ b/validphys2/src/validphys/tableloader.py
@@ -32,6 +32,11 @@ def fixup_header(df, head_index, dtype):
                                   *oldcols.levels[head_index+1:]])
     df.columns = newcols
 
+def parse_data_cv(filename):
+    """Useful for reading DataFrames with just one column."""
+    df = sane_load(filename, index_col=[0, 1, 2])
+    return df
+
 def parse_exp_mat(filename):
     """Parse a dump of a matrix like experiments_covmat."""
     df = sane_load(filename, header=[0,1,2], index_col=[0,1,2])
diff --git a/validphys2/src/validphys/tests/regressions/test_mcreplica.csv b/validphys2/src/validphys/tests/regressions/test_mcreplica.csv
new file mode 100644
index 0000000000..bdfab83f4b
--- /dev/null
+++ b/validphys2/src/validphys/tests/regressions/test_mcreplica.csv
@@ -0,0 +1,267 @@
+group	dataset	id	data
+ATLAS	ATLASWZRAP36PB	0	625874.0721154683
+ATLAS	ATLASWZRAP36PB	1	629690.9940296302
+ATLAS	ATLASWZRAP36PB	2	647804.631311913
+ATLAS	ATLASWZRAP36PB	3	640294.5028097499
+ATLAS	ATLASWZRAP36PB	4	666615.6925633255
+ATLAS	ATLASWZRAP36PB	5	679742.6040870899
+ATLAS	ATLASWZRAP36PB	6	661543.8956116233
+ATLAS	ATLASWZRAP36PB	7	661401.0725677101
+ATLAS	ATLASWZRAP36PB	8	694185.1736757511
+ATLAS	ATLASWZRAP36PB	9	674955.0227261193
+ATLAS	ATLASWZRAP36PB	10	595607.510395848
+ATLAS	ATLASWZRAP36PB	11	461178.58903195267
+ATLAS	ATLASWZRAP36PB	12	454625.3670724297
+ATLAS	ATLASWZRAP36PB	13	481323.87751588045
+ATLAS	ATLASWZRAP36PB	14	458849.11243098835
+ATLAS	ATLASWZRAP36PB	15	439424.94273936
+ATLAS	ATLASWZRAP36PB	16	432160.16046439036
+ATLAS	ATLASWZRAP36PB	17	403846.69581751985
+ATLAS	ATLASWZRAP36PB	18	396368.08282609435
+ATLAS	ATLASWZRAP36PB	19	393885.7902546321
+ATLAS	ATLASWZRAP36PB	20	366443.97889378696
+ATLAS	ATLASWZRAP36PB	21	344324.53993439255
+ATLAS	ATLASWZRAP36PB	22	135144.70842506472
+ATLAS	ATLASWZRAP36PB	23	139058.67466877738
+ATLAS	ATLASWZRAP36PB	24	129580.76369541005
+ATLAS	ATLASWZRAP36PB	25	125247.04589283372
+ATLAS	ATLASWZRAP36PB	26	119478.16024355698
+ATLAS	ATLASWZRAP36PB	27	115032.04083971222
+ATLAS	ATLASWZRAP36PB	28	104865.22589012324
+ATLAS	ATLASWZRAP36PB	29	57082.53327858195
+ATLAS	ATLASZHIGHMASS49FB	0	225.2241265581888
+ATLAS	ATLASZHIGHMASS49FB	1	102.44036519725282
+ATLAS	ATLASZHIGHMASS49FB	2	51.11583760792298
+ATLAS	ATLASZHIGHMASS49FB	3	27.488866078195667
+ATLAS	ATLASZHIGHMASS49FB	4	18.220176599654852
+ATLAS	ATLASZHIGHMASS49FB	5	11.174983683757414
+ATLAS	ATLASZHIGHMASS49FB	6	8.695255033748985
+ATLAS	ATLASZHIGHMASS49FB	7	4.521850917275335
+ATLAS	ATLASZHIGHMASS49FB	8	1.7204940792698935
+ATLAS	ATLASZHIGHMASS49FB	9	0.4698742453309486
+ATLAS	ATLASZHIGHMASS49FB	10	0.14129611171097067
+ATLAS	ATLASZHIGHMASS49FB	11	0.027030707761265878
+ATLAS	ATLASZHIGHMASS49FB	12	0.005449291740977146
+ATLAS	ATLASLOMASSDY11EXT	0	13576.517089454817
+ATLAS	ATLASLOMASSDY11EXT	1	23865.177412820023
+ATLAS	ATLASLOMASSDY11EXT	2	14606.30537341771
+ATLAS	ATLASLOMASSDY11EXT	3	6812.0492301686645
+ATLAS	ATLASLOMASSDY11EXT	4	2873.966448543525
+ATLAS	ATLASLOMASSDY11EXT	5	1270.81693829667
+ATLAS	ATLASWZRAP11	0	575091.3781838007
+ATLAS	ATLASWZRAP11	1	573831.5774693731
+ATLAS	ATLASWZRAP11	2	577308.4133944701
+ATLAS	ATLASWZRAP11	3	581166.6968380663
+ATLAS	ATLASWZRAP11	4	579180.5469411756
+ATLAS	ATLASWZRAP11	5	595392.1602877442
+ATLAS	ATLASWZRAP11	6	590119.459024331
+ATLAS	ATLASWZRAP11	7	598485.3027461147
+ATLAS	ATLASWZRAP11	8	596928.0553084731
+ATLAS	ATLASWZRAP11	9	589065.2544999232
+ATLAS	ATLASWZRAP11	10	554887.8805729451
+ATLAS	ATLASWZRAP11	11	432286.1471328586
+ATLAS	ATLASWZRAP11	12	428885.6752850355
+ATLAS	ATLASWZRAP11	13	426354.80807207135
+ATLAS	ATLASWZRAP11	14	419728.3652439385
+ATLAS	ATLASWZRAP11	15	411428.30053804716
+ATLAS	ATLASWZRAP11	16	402163.10469116265
+ATLAS	ATLASWZRAP11	17	384833.51730835374
+ATLAS	ATLASWZRAP11	18	373988.9640125536
+ATLAS	ATLASWZRAP11	19	360622.88887275255
+ATLAS	ATLASWZRAP11	20	340702.7760869247
+ATLAS	ATLASWZRAP11	21	317134.5980054034
+ATLAS	ATLASWZRAP11	22	133799.96364220293
+ATLAS	ATLASWZRAP11	23	133448.86365596423
+ATLAS	ATLASWZRAP11	24	132637.63442726055
+ATLAS	ATLASWZRAP11	25	130409.92750454393
+ATLAS	ATLASWZRAP11	26	130912.15120182281
+ATLAS	ATLASWZRAP11	27	127353.39305518712
+ATLAS	ATLASWZRAP11	28	117895.4802432277
+ATLAS	ATLASWZRAP11	29	105832.75453175441
+ATLAS	ATLASWZRAP11	30	88155.7125609602
+ATLAS	ATLASWZRAP11	31	67414.70246827461
+ATLAS	ATLASWZRAP11	32	44841.69930108373
+ATLAS	ATLASWZRAP11	33	21730.44158986774
+CMS	CMSZDIFF12	0	9886.334551571706
+CMS	CMSZDIFF12	1	2906.558370791581
+CMS	CMSZDIFF12	2	1065.8985621057848
+CMS	CMSZDIFF12	3	456.26790474310286
+CMS	CMSZDIFF12	4	224.22271558328507
+CMS	CMSZDIFF12	5	110.44990446224114
+CMS	CMSZDIFF12	6	62.579695282891535
+CMS	CMSZDIFF12	7	30.063450520529134
+CMS	CMSZDIFF12	8	13.922543892672062
+CMS	CMSZDIFF12	9	0.6141016049164042
+CMS	CMSZDIFF12	10	9856.366953238841
+CMS	CMSZDIFF12	11	2865.600713840613
+CMS	CMSZDIFF12	12	1059.860323720172
+CMS	CMSZDIFF12	13	445.179296515881
+CMS	CMSZDIFF12	14	212.87853289770084
+CMS	CMSZDIFF12	15	109.79806111700381
+CMS	CMSZDIFF12	16	57.81793265111689
+CMS	CMSZDIFF12	17	30.846128145509127
+CMS	CMSZDIFF12	18	13.294573202916089
+CMS	CMSZDIFF12	19	0.6295615951503276
+CMS	CMSZDIFF12	20	9172.143310236264
+CMS	CMSZDIFF12	21	2584.4159164450075
+CMS	CMSZDIFF12	22	934.5819007730142
+CMS	CMSZDIFF12	23	414.03599558528543
+CMS	CMSZDIFF12	24	202.31230308651277
+CMS	CMSZDIFF12	25	101.61697272749291
+CMS	CMSZDIFF12	26	55.241506126064486
+CMS	CMSZDIFF12	27	28.99689277027626
+CMS	CMSZDIFF12	28	13.300620394041099
+CMS	CMSZDIFF12	29	0.5485904704152681
+CMS	CMSZDIFF12	30	6881.389313938887
+CMS	CMSZDIFF12	31	1929.5303610925562
+CMS	CMSZDIFF12	32	742.7463056241985
+CMS	CMSZDIFF12	33	318.6113512739281
+CMS	CMSZDIFF12	34	166.3854976523764
+CMS	CMSZDIFF12	35	83.77177274421992
+CMS	CMSZDIFF12	36	47.38911931541346
+CMS	CMSZDIFF12	37	23.574007377898496
+CMS	CMSZDIFF12	38	11.935536472481251
+CMS	CMSZDIFF12	39	0.45705487477676426
+CMS	CMSZDIFF12	40	3707.934954547729
+CMS	CMSZDIFF12	41	1021.7408959849994
+CMS	CMSZDIFF12	42	383.9123726446375
+CMS	CMSZDIFF12	43	178.03122251767607
+CMS	CMSZDIFF12	44	92.3591212500585
+CMS	CMSZDIFF12	45	47.898485423688264
+CMS	CMSZDIFF12	46	28.687924568924732
+CMS	CMSZDIFF12	47	15.073356404547805
+CMS	CMSZDIFF12	48	6.8587092533392475
+CMS	CMSZDIFF12	49	0.2537524987802203
+CMS	CMSJETS11	0	3064.8412856542986
+CMS	CMSJETS11	1	1449.5091642980894
+CMS	CMSJETS11	2	676.135506669856
+CMS	CMSJETS11	3	328.6688802887687
+CMS	CMSJETS11	4	170.7883047739323
+CMS	CMSJETS11	5	89.97188258438256
+CMS	CMSJETS11	6	48.1640183282502
+CMS	CMSJETS11	7	26.21942900354327
+CMS	CMSJETS11	8	14.019421384758264
+CMS	CMSJETS11	9	7.99060694491261
+CMS	CMSJETS11	10	4.620071821124748
+CMS	CMSJETS11	11	2.653929526854456
+CMS	CMSJETS11	12	1.5706174029329598
+CMS	CMSJETS11	13	0.8776470184526391
+CMS	CMSJETS11	14	0.5180721939935242
+CMS	CMSJETS11	15	0.3138789290162737
+CMS	CMSJETS11	16	0.18168510415487119
+CMS	CMSJETS11	17	0.10788319709417281
+CMS	CMSJETS11	18	0.06449279142848932
+CMS	CMSJETS11	19	0.03712444722658268
+CMS	CMSJETS11	20	0.021921922929691376
+CMS	CMSJETS11	21	0.013033700911296818
+CMS	CMSJETS11	22	0.007512607144544201
+CMS	CMSJETS11	23	0.004114373157279753
+CMS	CMSJETS11	24	0.0022362655927216464
+CMS	CMSJETS11	25	0.001482939378122927
+CMS	CMSJETS11	26	0.0007342552398618355
+CMS	CMSJETS11	27	0.00031227731387260945
+CMS	CMSJETS11	28	0.00020129807744728782
+CMS	CMSJETS11	29	9.225641840455252e-05
+CMS	CMSJETS11	30	5.8513477579783676e-05
+CMS	CMSJETS11	31	1.4973792398939266e-05
+CMS	CMSJETS11	32	2.257710172411508e-06
+CMS	CMSJETS11	33	2938.4983182356086
+CMS	CMSJETS11	34	1306.838248806242
+CMS	CMSJETS11	35	597.4152766849537
+CMS	CMSJETS11	36	290.0206657424005
+CMS	CMSJETS11	37	152.23116359185778
+CMS	CMSJETS11	38	83.52798139030051
+CMS	CMSJETS11	39	42.09591785153588
+CMS	CMSJETS11	40	22.8506640473896
+CMS	CMSJETS11	41	13.020425758807377
+CMS	CMSJETS11	42	7.290889884016562
+CMS	CMSJETS11	43	4.026941804347156
+CMS	CMSJETS11	44	2.3465671617499315
+CMS	CMSJETS11	45	1.3340712136907575
+CMS	CMSJETS11	46	0.7737929315587418
+CMS	CMSJETS11	47	0.43960904171665677
+CMS	CMSJETS11	48	0.25276210474175886
+CMS	CMSJETS11	49	0.14717720949714796
+CMS	CMSJETS11	50	0.0847412733331661
+CMS	CMSJETS11	51	0.048881737479790216
+CMS	CMSJETS11	52	0.028451560671024542
+CMS	CMSJETS11	53	0.01599524989930021
+CMS	CMSJETS11	54	0.008733853605811368
+CMS	CMSJETS11	55	0.004594104744837374
+CMS	CMSJETS11	56	0.002236004787141291
+CMS	CMSJETS11	57	0.0014403506626033526
+CMS	CMSJETS11	58	0.0007585311754938085
+CMS	CMSJETS11	59	0.0003499911931369715
+CMS	CMSJETS11	60	0.00011161977018081332
+CMS	CMSJETS11	61	7.773134653806228e-05
+CMS	CMSJETS11	62	1.0739797452865742e-05
+CMS	CMSJETS11	63	2582.266754202891
+CMS	CMSJETS11	64	1096.3104931367543
+CMS	CMSJETS11	65	515.8803717520409
+CMS	CMSJETS11	66	247.584409749495
+CMS	CMSJETS11	67	125.56185712566229
+CMS	CMSJETS11	68	64.59934467165961
+CMS	CMSJETS11	69	33.7709164498504
+CMS	CMSJETS11	70	18.28629614870014
+CMS	CMSJETS11	71	9.863997071005198
+CMS	CMSJETS11	72	5.208105065908411
+CMS	CMSJETS11	73	2.890135232927644
+CMS	CMSJETS11	74	1.6089548948524601
+CMS	CMSJETS11	75	0.849452414837028
+CMS	CMSJETS11	76	0.4673888181261503
+CMS	CMSJETS11	77	0.2668471983983226
+CMS	CMSJETS11	78	0.14592329381516939
+CMS	CMSJETS11	79	0.07508557256079366
+CMS	CMSJETS11	80	0.040293907781475916
+CMS	CMSJETS11	81	0.02119984449211859
+CMS	CMSJETS11	82	0.010995803998216362
+CMS	CMSJETS11	83	0.005466812931544958
+CMS	CMSJETS11	84	0.002642109565833648
+CMS	CMSJETS11	85	0.0013200185870372624
+CMS	CMSJETS11	86	0.0005247905933328646
+CMS	CMSJETS11	87	0.00028209984999926743
+CMS	CMSJETS11	88	0.0001720895024153124
+CMS	CMSJETS11	89	1.1238503789800609e-05
+CMS	CMSJETS11	90	2003.922630727413
+CMS	CMSJETS11	91	874.731868408299
+CMS	CMSJETS11	92	392.1741073514707
+CMS	CMSJETS11	93	187.66789571263348
+CMS	CMSJETS11	94	92.02147434259216
+CMS	CMSJETS11	95	44.1730889548527
+CMS	CMSJETS11	96	23.833613441467335
+CMS	CMSJETS11	97	11.819012342827115
+CMS	CMSJETS11	98	6.209444739322469
+CMS	CMSJETS11	99	3.099195938055046
+CMS	CMSJETS11	100	1.5455020113031772
+CMS	CMSJETS11	101	0.8181216756637731
+CMS	CMSJETS11	102	0.3978236711718021
+CMS	CMSJETS11	103	0.1973724877272748
+CMS	CMSJETS11	104	0.09488179243734336
+CMS	CMSJETS11	105	0.04284285858513589
+CMS	CMSJETS11	106	0.02013247664048886
+CMS	CMSJETS11	107	0.00904569048761001
+CMS	CMSJETS11	108	0.003957411563322136
+CMS	CMSJETS11	109	0.0013947956627950616
+CMS	CMSJETS11	110	0.0005423450935199612
+CMS	CMSJETS11	111	0.00019426047276743468
+CMS	CMSJETS11	112	8.641325482048829e-05
+CMS	CMSJETS11	113	1.1205549962724038e-05
+CMS	CMSJETS11	114	1378.9721889654777
+CMS	CMSJETS11	115	586.5280099525319
+CMS	CMSJETS11	116	254.46531592644695
+CMS	CMSJETS11	117	117.48542213423157
+CMS	CMSJETS11	118	50.164169623489386
+CMS	CMSJETS11	119	23.869966504044648
+CMS	CMSJETS11	120	10.595115571480735
+CMS	CMSJETS11	121	4.680940497616173
+CMS	CMSJETS11	122	2.026200900502425
+CMS	CMSJETS11	123	0.9369729158473754
+CMS	CMSJETS11	124	0.3821993863199565
+CMS	CMSJETS11	125	0.14796011741945186
+CMS	CMSJETS11	126	0.05808605576889366
+CMS	CMSJETS11	127	0.020258461063771327
+CMS	CMSJETS11	128	0.006549884219507623
+CMS	CMSJETS11	129	0.0021102678286488382
+CMS	CMSJETS11	130	0.00027998963463139373
+CMS	CMSJETS11	131	8.597370643627551e-05
+CMS	CMSJETS11	132	9.361652706292585e-06
diff --git a/validphys2/src/validphys/tests/test_regressions.py b/validphys2/src/validphys/tests/test_regressions.py
index 9dd2c10333..dd671ee3ec 100644
--- a/validphys2/src/validphys/tests/test_regressions.py
+++ b/validphys2/src/validphys/tests/test_regressions.py
@@ -18,8 +18,9 @@
 import NNPDF
 from validphys import results
 from validphys.api import API
-from validphys.tableloader import (parse_exp_mat, load_perreplica_chi2_table,
+from validphys.tableloader import (parse_data_cv, parse_exp_mat, load_perreplica_chi2_table,
                                    sane_load, load_fits_chi2_table)
+from validphys.tests.test_covmats import CORR_DATA
 
 
 
@@ -51,6 +52,18 @@ def f_(*args, **kwargs):
         return f_
     return decorator
 
+
+@make_table_comp(parse_data_cv)
+def test_mcreplica(data_config):
+    config = dict(data_config)
+    config["dataset_inputs"] = CORR_DATA
+    seed = 123456
+    # Use no cuts because if filter rules change in the
+    # future then this test will end up failing
+    rep = API.indexed_make_replica(**config, seed=seed)
+    return rep
+
+
 @make_table_comp(parse_exp_mat)
 def test_expcovmat(data_config):
     mat = API.groups_covmat_no_table(**data_config)