From 8493c7e892b1181041674a0bf5c0d3f6f17e59df Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 19 Aug 2020 14:57:02 +0100 Subject: [PATCH 01/28] A direct translation of the C++ pseudodata code No attempt at optimization has been done Uses the same seed structure as C++ so will give the same pseudodata if the same RNG seed is used. This commit is a proof of concept. --- validphys2/src/validphys/covmats.py | 61 +++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 530ced5815..3bf0b2dbf4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,6 +12,7 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -21,10 +22,64 @@ ) from validphys.results import ThPredictionsResult +import NNPDF + log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") +def make_replica(commondata, seed=1): + # dataset_input: {dataset: NMC, frac: 0.5} + + # use_cuts: nocuts + + # theoryid: 53 + + # template_text: | + # {@make_replica@} + + # actions_: + # - report(main=True) + + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + ld_cd = load_commondata(commondata) + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = 
np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue + + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] + + # TODO: check the pseudodata is positive + # requires implementing the above code in a while loop + return artdata + def covmat_from_systematics(commondata): """Take the statistical uncertainty and systematics table from @@ -68,6 +123,12 @@ def covmat_from_systematics(commondata): CommonData which stores information about systematic errors, their treatment and description. + use_mult_errors: bool + Boolean which controls whether we use multiplicative errors + when computing the covmat. By default it should be set to ``True`` + unles one wishes to uses a sampling matrix for pseudodata generation, + in which case it should be set to ``False``. 
+ Returns ------- cov_mat : np.array From 68e9ced95e59d83f3ab84ec1bc94bcee8f27021c Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 11:46:02 +0100 Subject: [PATCH 02/28] Loop until positivity of pseudodata is achieved --- validphys2/src/validphys/covmats.py | 54 ++++++++++++++++------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 3bf0b2dbf4..18c76261ff 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -50,34 +50,40 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) - - correlated_deviates = sampling_matrix @ deviates + # We need to loop until the pseudodata is positive + while True: + rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] + deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] + deviates, rand = map(np.array, (deviates, rand)) + + correlated_deviates = sampling_matrix @ deviates + + artdata = np.array(ld_cd.central_values) + correlated_deviates + + sys_table = ld_cd.sys_errors + for i in range(ld_cd.ndata): + xnor = 1 + for j in range(ld_cd.nsys): + sys = sys_table.iloc[i, j] + if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): + continue - artdata = np.array(ld_cd.central_values) + correlated_deviates + if sys.name == 'UNCORR': + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) + else: + if sys.sys_type == 'ADD': + continue + xnor *= (1.0 + rand[j] * sys.mult/100) + artdata[i] = xnor * artdata[i] - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, 
j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue + artdata = np.array(artdata) - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + # Check positivity of the pseudodata + if all(artdata > 0): + break - # TODO: check the pseudodata is positive - # requires implementing the above code in a while loop return artdata From b2245c8f5f600469f1124a89bdac326c5105d99a Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 20 Aug 2020 15:47:37 +0100 Subject: [PATCH 03/28] Using pandas and numpy to generate the pseudodata This removes the for loop implementation --- validphys2/src/validphys/covmats.py | 73 +++++++++++++++++++---------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 18c76261ff..749bd01ab2 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -50,41 +50,64 @@ def make_replica(commondata, seed=1): covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. 
+ # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know which of the remaining systematics were + # uncorrelated. The other we mark as NaN + mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + # We need to loop until the pseudodata is positive while True: - rand = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.nsys)] - deviates = [rng.GetRandomGausDev(1.0) for _ in range(ld_cd.ndata)] - deviates, rand = map(np.array, (deviates, rand)) + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) correlated_deviates = sampling_matrix @ deviates - artdata = np.array(ld_cd.central_values) + correlated_deviates - - sys_table = ld_cd.sys_errors - for i in range(ld_cd.ndata): - xnor = 1 - for j in range(ld_cd.nsys): - sys = sys_table.iloc[i, j] - if sys.name in ('THEORYCORR', 'THEORYUNCORR', 'SKIP'): - continue - - if sys.name == 'UNCORR': - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rng.GetRandomGausDev(1.0)*sys.mult/100) - else: - if sys.sys_type == 'ADD': - continue - xnor *= (1.0 + rand[j] * sys.mult/100) - artdata[i] = xnor * artdata[i] + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) - artdata = np.array(artdata) + # The final pseudodata is obtained by doing an 
elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata - if all(artdata > 0): + if all(pseudodata > 0): break - return artdata + return pseudodata def covmat_from_systematics(commondata): From 196f31faf51f85107505af59a1371a8da77febeb Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 24 Aug 2020 15:24:47 +0100 Subject: [PATCH 04/28] Adding docstring for make_replica function Also correcting some typos in the docstring of covmat_from_systematics --- validphys2/src/validphys/covmats.py | 64 ++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 749bd01ab2..541b607cf7 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,6 +12,7 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec +from validphys.coredata import CommonData from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, @@ -29,24 +30,58 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") def make_replica(commondata, seed=1): - # dataset_input: {dataset: NMC, frac: 0.5} + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + of the data central value. - # use_cuts: nocuts + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. 
The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. - # theoryid: 53 + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. - # template_text: | - # {@make_replica@} + seed: int + Seed used to initialise the random number generator. - # actions_: - # - report(main=True) + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.covmats import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. 
+ """ # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) rng = NNPDF.RandomGenerator.GetRNG() - ld_cd = load_commondata(commondata) + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) sampling_matrix = sqrt_covmat(covmat_for_sampling) @@ -69,8 +104,7 @@ def make_replica(commondata, seed=1): ] mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - # We want to know which of the remaining systematics were - # uncorrelated. The other we mark as NaN + # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] # We need to loop until the pseudodata is positive @@ -146,9 +180,9 @@ def covmat_from_systematics(commondata): `paper `_ outlining the procedure, specifically equation 2 and surrounding text. - Paramaters + Parameters ---------- - commondata : validphys.coredata.CommonData + commondata: validphys.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. @@ -160,9 +194,9 @@ def covmat_from_systematics(commondata): Returns ------- - cov_mat : np.array - Numpy array which is N_dat x N_dat (where N_dat is the number of data - points after cuts) containing uncertainty and correlation information. + cov_mat: np.array + Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts) + containing uncertainty and correlation information. 
Example ------- From c5ba90d867071291f651d037ebec48c5b8858a9a Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Wed, 26 Aug 2020 13:38:58 +0100 Subject: [PATCH 05/28] Apply suggestions from code review Co-authored-by: Cameron Voisey <32741139+voisey@users.noreply.github.com> --- validphys2/src/validphys/covmats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 541b607cf7..55b9396017 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -31,7 +31,7 @@ def make_replica(commondata, seed=1): """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` and returns pseudodata replicas + :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas of the data central value. The square root (obtained by the Cholesky decomposition) of the covariance @@ -189,7 +189,7 @@ def covmat_from_systematics(commondata): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unles one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to uses a sampling matrix for pseudodata generation, in which case it should be set to ``False``. 
Returns From ca2f21422e2c33c9a7d586b52aa576e58fad0b60 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 13:00:32 +0100 Subject: [PATCH 06/28] Moving to pseudodata.py --- validphys2/src/validphys/covmats.py | 4 - validphys2/src/validphys/pseudodata.py | 120 +++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 55b9396017..51626b6ac4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -12,8 +12,6 @@ from validphys.calcutils import regularize_covmat, get_df_block from validphys.core import PDF, DataGroupSpec, DataSetSpec -from validphys.coredata import CommonData -from validphys.commondataparser import load_commondata from validphys.checks import ( check_dataset_cuts_match_theorycovmat, check_norm_threshold, @@ -23,8 +21,6 @@ ) from validphys.results import ThPredictionsResult -import NNPDF - log = logging.getLogger(__name__) INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d62331f467..d0990114ab 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -12,12 +12,17 @@ import pandas as pd from validphys.checks import check_cuts_fromfit, check_darwin_single_process +from validphys.commondataparser import load_commondata +from validphys.coredata import CommonData +from validphys.covmats import covmat_from_systematics, sqrt_covmat from reportengine import collect import n3fit.io.reader as reader from n3fit.performfit import initialize_seeds +import NNPDF + log = logging.getLogger(__name__) @@ -104,6 +109,121 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index +def make_replica(commondata, seed=1): + """Function that takes in a :py:class:`validphys.coredata.CommonData` or + 
:py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas + of the data central value. + + The square root (obtained by the Cholesky decomposition) of the covariance + matrix is used as the sampling matrix. The covariance matrix is obtained using + :py:func:`validphys.covmats.covmat_from_systematics` and must not use + multiplicative systematic uncertainties when obtaining the sampling matrix. + + Parameters + --------- + commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` + CommonData which stores information about systematic errors, + their treatment and description. + + seed: int + Seed used to initialise the random number generator. + + Returns + ------- + pseudodata: np.array + Numpy array which is N_dat (where N_dat is the number of data points after cuts) + containing monte carlo samples of data centered around the data central value. + + + Example + ------- + >>> from validphys.loader import Loader + >>> l = Loader() + >>> from validphys.pseudodata import make_replica + >>> cd = l.check_commondata("NMC") + >>> make_replica(cd) + - Random Generator allocated: ranlux + array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + + .. todo:: Replace the inhouse random number generation with a numpy equivalent. + .. todo:: Allow for correlations between datasets within an experiment. 
+ """ + # TODO: eventually remove the inhouse random number generator + # in favour of e.g numpy + NNPDF.RandomGenerator.InitRNG(0, seed) + rng = NNPDF.RandomGenerator.GetRNG() + + if not isinstance(commondata, CommonData): + ld_cd = load_commondata(commondata) + else: + ld_cd = commondata + + covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(covmat_for_sampling) + + central_values = ld_cd.central_values + nsys, ndata = ld_cd.nsys, ld_cd.ndata + + # Define these variables here to prevent them + # from being redefined in the while loop + sys_names = ld_cd.systype_table["name"] + sys_types = ld_cd.systype_table["type"] + systematics = ld_cd.sys_errors + + # Drop any systematics that are related to theory uncertainties + # or have explicitly been labelled as a systematic to be skipped. + # We are only interested in multiplicative uncertainties + mult_uncorr_sys = systematics.loc[ + :, + (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) + & (sys_types == "MULT"), + ] + mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + + # We want to know the names of the remaining systematics. 
+ mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + + correlated_deviates = sampling_matrix @ deviates + + pseudodata = np.array(central_values) + correlated_deviates + + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( + [ + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) + ] + ) + + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. 
+ pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + + # Check positivity of the pseudodata + if all(pseudodata > 0): + break + + return pseudodata + + @check_darwin_single_process def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): """A function to obtain information about the pseudodata that went From 821b6aae77721a1915b063fedb55967cd6d9e263 Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 27 Aug 2020 15:54:30 +0100 Subject: [PATCH 07/28] Making the function into an infinite generator Thus it remembers the RNG state --- validphys2/src/validphys/pseudodata.py | 74 ++++++++++++++------------ 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index d0990114ab..69a65e2aef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -110,9 +110,10 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. + """Generator that takes in a :py:class:`validphys.coredata.CommonData` or + :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas + of the data central value. This generator is infinite in the sense that it will + never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance matrix is used as the sampling matrix. 
The covariance matrix is obtained using @@ -141,13 +142,14 @@ def make_replica(commondata, seed=1): >>> l = Loader() >>> from validphys.pseudodata import make_replica >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) + >>> data_generator = make_replica(cd) + >>> next(data_generator) - Random Generator allocated: ranlux array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , + 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, + 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, + 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, + .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. @@ -187,41 +189,45 @@ def make_replica(commondata, seed=1): # We want to know the names of the remaining systematics. mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - # We need to loop until the pseudodata is positive + # We make this generator infinite in the sense that it shall never + # raise a StopIteration exception. Thus we can continuely request + # pseudodata from it and it will remember the RNG state. 
while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) + # We need to loop until the pseudodata is positive + while True: + # It is useful to have this quantity as a pandas.Series + rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) + # And this as a numpy.array + deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - correlated_deviates = sampling_matrix @ deviates + correlated_deviates = sampling_matrix @ deviates - pseudodata = np.array(central_values) + correlated_deviates + pseudodata = np.array(central_values) + correlated_deviates - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ + # Matrix of random numbers generated by either sampling from rand in the + # correct manner, or by requesting a new random number if the systematic + # error type was UNCORR + random_matrix = np.array( [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + [ + rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] + for i, sys_type in mult_uncorr_names.iteritems() + ] + for _ in range(ndata) ] - for _ in range(ndata) - ] - ) + ) - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + # The final pseudodata is obtained by doing an elementwise multiplication + # of the multiplicative uncertainties with the above random matrix. We then + # add unity to all elements and take the product across columns. This vector + # must then be elementwise multiplied by central values + correlated deviates. + pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): - break + # Check positivity of the pseudodata + if all(pseudodata > 0): + break - return pseudodata + yield pseudodata @check_darwin_single_process From 7f4c3f567da4d0c2a28b06bb3815c7d0c5602f45 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 10:54:01 +0100 Subject: [PATCH 08/28] Adding comment explaining why we divide the multiplicative uncertainties by 100 --- validphys2/src/validphys/pseudodata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 69a65e2aef..186747f62c 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -184,6 +184,8 @@ def make_replica(commondata, seed=1): (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] + # The multiplicative uncertainties are quoted as percentages, so + # we change them to fractional values mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. 
From 9f28ca9592812afcc214f5ac96561123a8ec88e6 Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:29:26 +0100 Subject: [PATCH 09/28] Update validphys2/src/validphys/covmats.py Co-authored-by: Zaharid --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 51626b6ac4..921ae2bfdd 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -185,7 +185,7 @@ def covmat_from_systematics(commondata): use_mult_errors: bool Boolean which controls whether we use multiplicative errors when computing the covmat. By default it should be set to ``True`` - unless one wishes to uses a sampling matrix for pseudodata generation, + unless one wishes to sample pseudodata from a multigaussian distribution, in which case it should be set to ``False``. Returns From da36db57b96ceb58c3e5f63a193d1af1016a6ef7 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 4 Sep 2020 16:10:04 +0100 Subject: [PATCH 10/28] Changing covmat_for_sampling to additive_covmat --- validphys2/src/validphys/pseudodata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 186747f62c..09ce272ad2 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -164,8 +164,8 @@ def make_replica(commondata, seed=1): else: ld_cd = commondata - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) + additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) + sampling_matrix = sqrt_covmat(additive_covmat) central_values = ld_cd.central_values nsys, ndata = ld_cd.nsys, ld_cd.ndata From 5649743c07984f18f8856cabaaa593beffffb132 Mon Sep 17 00:00:00 2001 From: siranipour Date: 
Wed, 16 Sep 2020 15:30:32 +0100 Subject: [PATCH 11/28] Changing variable names to better reflect their purpose --- validphys2/src/validphys/pseudodata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 09ce272ad2..e111906767 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -179,17 +179,17 @@ def make_replica(commondata, seed=1): # Drop any systematics that are related to theory uncertainties # or have explicitly been labelled as a systematic to be skipped. # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ + mult_sys = systematics.loc[ :, (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) & (sys_types == "MULT"), ] # The multiplicative uncertainties are quoted as percentages, so # we change them to fractional values - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) + mult_sys = mult_sys.applymap(lambda x: x.mult / 100) # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] + mult_names = sys_names.loc[mult_sys.columns] # We make this generator infinite in the sense that it shall never # raise a StopIteration exception. Thus we can continuely request @@ -213,7 +213,7 @@ def make_replica(commondata, seed=1): [ [ rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() + for i, sys_type in mult_names.iteritems() ] for _ in range(ndata) ] @@ -223,7 +223,7 @@ def make_replica(commondata, seed=1): # of the multiplicative uncertainties with the above random matrix. We then # add unity to all elements and take the product across columns. This vector # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata + pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata # Check positivity of the pseudodata if all(pseudodata > 0): From fe6a653c51ea133c436c182220e27e95d305cb46 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:12:33 +0100 Subject: [PATCH 12/28] Accounting for the asymmetry edge case --- validphys2/src/validphys/pseudodata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e111906767..1414f8f3ef 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,9 @@ def make_replica(commondata, seed=1): # must then be elementwise multiplied by central values + correlated deviates. pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata - if all(pseudodata > 0): + # Check positivity of the pseudodata. Note that if the dataset is an asymmetry + # then it is allowed to be negative. + if all(pseudodata > 0) or ("ASY" in commondata.process_type): break yield pseudodata From 7d5c9d289a10e42c3233ef79bc6da5866111562c Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 16 Sep 2020 16:55:48 +0100 Subject: [PATCH 13/28] Adding information on the nature of sampling in the docstring --- validphys2/src/validphys/pseudodata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 1414f8f3ef..0c462cd083 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -120,6 +120,11 @@ def make_replica(commondata, seed=1): :py:func:`validphys.covmats.covmat_from_systematics` and must not use multiplicative systematic uncertainties when obtaining the sampling matrix. 
+ Gaussian sampling is performed using the additive part of the covariance matrix. + An additional gaussian sampling for the renormalization of the data having multiplicative + uncertainties happens later. All the contributions to the covariance matrix coming from + theory errors should be accounted for in the additive part + Parameters --------- commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` From 3365dec2687608b33f13f8c25311cf954c540df3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 6 Oct 2020 17:29:05 +0100 Subject: [PATCH 14/28] Correcting covmat_from_systematics example --- validphys2/src/validphys/covmats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 921ae2bfdd..f861f12884 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -198,7 +198,7 @@ def covmat_from_systematics(commondata): ------- >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> from validphys.calcutils import covmat_from_systematics + >>> from validphys.covmats import covmat_from_systematics >>> l = Loader() >>> cd = l.check_commondata("NMC") >>> cd = load_commondata(cd) From a3fe40fe48736388dbc66420706e41611c9a4b47 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:15:56 +0100 Subject: [PATCH 15/28] Clarifying doc strings Adding Zahari's review comments --- validphys2/src/validphys/covmats.py | 7 ++++--- validphys2/src/validphys/pseudodata.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index f861f12884..17660d76c5 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -183,10 +183,11 @@ def covmat_from_systematics(commondata): their treatment and description. 
use_mult_errors: bool - Boolean which controls whether we use multiplicative errors + Boolean which controls whether multiplicative errors are used when computing the covmat. By default it should be set to ``True`` - unless one wishes to sample pseudodata from a multigaussian distribution, - in which case it should be set to ``False``. + since setting this parameter to ``False`` will completely ignore multiplicative + uncertainties. This flag is present for use with pseudodata generation where the + multiplicative uncertainties should be ignored. Returns ------- diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0c462cd083..7ed133912d 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -116,14 +116,14 @@ def make_replica(commondata, seed=1): never raise a StopIteration exception The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Gaussian sampling is performed using the additive part of the covariance matrix. - An additional gaussian sampling for the renormalization of the data having multiplicative - uncertainties happens later. All the contributions to the covariance matrix coming from - theory errors should be accounted for in the additive part + matrix is used as the sampling matrix with the covariance matrix being obtained using + :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call + to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to + ignore multiplicative uncertainties. Gaussian sampling is performed using the + additive part of the covariance matrix. 
An additional gaussian sampling for the + renormalization of the data having multiplicative uncertainties happens later. + All the contributions to the covariance matrix coming from theory errors should be + accounted for in the additive part. Parameters --------- From 79fbe95d7c3364f531a355e33632db8f39802984 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 13 Oct 2020 15:20:40 +0100 Subject: [PATCH 16/28] Setting default seed kwarg to None Assigning seed randomly if not provided explicilty in function call --- validphys2/src/validphys/pseudodata.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 7ed133912d..b804e68853 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,6 +7,7 @@ import multiprocessing as mp import os import pathlib +import random import numpy as np import pandas as pd @@ -109,7 +110,7 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=1): +def make_replica(commondata, seed=None): """Generator that takes in a :py:class:`validphys.coredata.CommonData` or :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -131,8 +132,10 @@ def make_replica(commondata, seed=1): CommonData which stores information about systematic errors, their treatment and description. - seed: int - Seed used to initialise the random number generator. + seed: int, None + Seed used to initialise the random number generator. If ``None`` then a random seed is allocated + using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing + inadvertent correlations. Returns ------- @@ -159,6 +162,11 @@ def make_replica(commondata, seed=1): .. 
todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. """ + # If seed is None (i.e not explicitly provided in the function call) + # then set it randomly + if seed is None: + seed = random.randint(1, 1000) + # TODO: eventually remove the inhouse random number generator # in favour of e.g numpy NNPDF.RandomGenerator.InitRNG(0, seed) From 8905f0ee2e28de97b801379feb9dce3195442cb3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 11:23:53 +0000 Subject: [PATCH 17/28] Removing make replica from covmats.py --- validphys2/src/validphys/covmats.py | 114 ---------------------------- 1 file changed, 114 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 17660d76c5..e184e57da0 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -25,120 +25,6 @@ INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR") -def make_replica(commondata, seed=1): - """Function that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and returns pseudodata replicas - of the data central value. - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix. The covariance matrix is obtained using - :py:func:`validphys.covmats.covmat_from_systematics` and must not use - multiplicative systematic uncertainties when obtaining the sampling matrix. - - Parameters - --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description. - - seed: int - Seed used to initialise the random number generator. 
- - Returns - ------- - pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) - containing monte carlo samples of data centered around the data central value. - - - Example - ------- - >>> from validphys.loader import Loader - >>> l = Loader() - >>> from validphys.covmats import make_replica - >>> cd = l.check_commondata("NMC") - >>> make_replica(cd) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - 0.35932347, 0.37967132, 0.37942145, 0.2931131 , 0.3504976 , - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. - """ - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - covmat_for_sampling = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(covmat_for_sampling) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. 
- # We are only interested in multiplicative uncertainties - mult_uncorr_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - mult_uncorr_sys = mult_uncorr_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_uncorr_names = sys_names.loc[mult_uncorr_sys.columns] - - # We need to loop until the pseudodata is positive - while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_uncorr_names.iteritems() - ] - for _ in range(ndata) - ] - ) - - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. 
- pseudodata = np.prod(1 + random_matrix * mult_uncorr_sys.to_numpy(), axis=1) * pseudodata - - # Check positivity of the pseudodata - if all(pseudodata > 0): - break - - return pseudodata - def covmat_from_systematics(commondata): """Take the statistical uncertainty and systematics table from From 0a8ec390a654c1208bb81766ea69bbda6b1c9afc Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 17 Nov 2020 13:35:46 +0000 Subject: [PATCH 18/28] Adding use_mult_errors flag to datasets_covmat_from_systematics --- validphys2/src/validphys/covmats.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index e184e57da0..85ccc91ee4 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -68,13 +68,6 @@ def covmat_from_systematics(commondata): CommonData which stores information about systematic errors, their treatment and description. - use_mult_errors: bool - Boolean which controls whether multiplicative errors are used - when computing the covmat. By default it should be set to ``True`` - since setting this parameter to ``False`` will completely ignore multiplicative - uncertainties. This flag is present for use with pseudodata generation where the - multiplicative uncertainties should be ignored. - Returns ------- cov_mat: np.array @@ -108,7 +101,7 @@ def covmat_from_systematics(commondata): commondata.stat_errors.to_numpy(), commondata.systematic_errors()) -def datasets_covmat_from_systematics(list_of_commondata): +def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): """Given a list containing :py:class:`validphys.coredata.CommonData` s, construct the full covariance matrix. @@ -123,6 +116,10 @@ def datasets_covmat_from_systematics(list_of_commondata): list_of_commondata : list[validphys.coredata.CommonData] list of CommonData objects. 
+ use_mult_errors: bool + Whether or not to include multiplicative systematic errors in the computation of + the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. + Returns ------- cov_mat : np.array @@ -147,7 +144,10 @@ def datasets_covmat_from_systematics(list_of_commondata): block_diags = [] for cd in list_of_commondata: - errors = cd.systematic_errors() + if use_mult_errors: + errors = cd.systematic_errors() + else: + errors = cd.additive_errors # separate out the special uncertainties which can be correlated across # datasets is_intra_dataset_error = errors.columns.isin(INTRA_DATASET_SYS_NAME) From a1820d353909dafd10d7358d63aa823984066705 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 7 Dec 2020 15:17:21 +0000 Subject: [PATCH 19/28] Adding functionality to compute pseudodata for a list of commondata Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 140 ++++++++++++------------- 1 file changed, 69 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index b804e68853..2e7d5cbee4 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -15,7 +15,7 @@ from validphys.checks import check_cuts_fromfit, check_darwin_single_process from validphys.commondataparser import load_commondata from validphys.coredata import CommonData -from validphys.covmats import covmat_from_systematics, sqrt_covmat +from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect @@ -110,7 +110,8 @@ def read_fit_pseudodata(fitcontext, context_index): yield pseudodata.drop("type", axis=1), tr.index, val.index -def make_replica(commondata, seed=None): +def make_replica(list_of_commondata, seed=None): + # TODO: this is the old docstring: update it """Generator that takes in a :py:class:`validphys.coredata.CommonData` or 
:py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas of the data central value. This generator is infinite in the sense that it will @@ -165,85 +166,82 @@ def make_replica(commondata, seed=None): # If seed is None (i.e not explicitly provided in the function call) # then set it randomly if seed is None: - seed = random.randint(1, 1000) + seed = np.random.randint(1000) - # TODO: eventually remove the inhouse random number generator - # in favour of e.g numpy - NNPDF.RandomGenerator.InitRNG(0, seed) - rng = NNPDF.RandomGenerator.GetRNG() - - if not isinstance(commondata, CommonData): - ld_cd = load_commondata(commondata) - else: - ld_cd = commondata - - additive_covmat = covmat_from_systematics(ld_cd, use_mult_errors=False) - sampling_matrix = sqrt_covmat(additive_covmat) - - central_values = ld_cd.central_values - nsys, ndata = ld_cd.nsys, ld_cd.ndata - - # Define these variables here to prevent them - # from being redefined in the while loop - sys_names = ld_cd.systype_table["name"] - sys_types = ld_cd.systype_table["type"] - systematics = ld_cd.sys_errors - - # Drop any systematics that are related to theory uncertainties - # or have explicitly been labelled as a systematic to be skipped. - # We are only interested in multiplicative uncertainties - mult_sys = systematics.loc[ - :, - (~sys_names.isin(["THEORYCORR", "THEORYUNCORR", "SKIP"])) - & (sys_types == "MULT"), - ] - # The multiplicative uncertainties are quoted as percentages, so - # we change them to fractional values - mult_sys = mult_sys.applymap(lambda x: x.mult / 100) - - # We want to know the names of the remaining systematics. - mult_names = sys_names.loc[mult_sys.columns] + # Seed the numpy RNG with the seed. + np.random.seed(seed=seed) # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continuely request + # raise a StopIteration exception. 
Thus we can continually request # pseudodata from it and it will remember the RNG state. while True: - # We need to loop until the pseudodata is positive + # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # It is useful to have this quantity as a pandas.Series - rand = pd.Series([rng.GetRandomGausDev(1.0) for _ in range(nsys)], index=sys_types.index) - # And this as a numpy.array - deviates = np.array([rng.GetRandomGausDev(1.0) for _ in range(ndata)]) - - correlated_deviates = sampling_matrix @ deviates - - pseudodata = np.array(central_values) + correlated_deviates - - # Matrix of random numbers generated by either sampling from rand in the - # correct manner, or by requesting a new random number if the systematic - # error type was UNCORR - random_matrix = np.array( - [ - [ - rng.GetRandomGausDev(1.0) if sys_type == "UNCORR" else rand[i] - for i, sys_type in mult_names.iteritems() - ] - for _ in range(ndata) - ] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + 
).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - # The final pseudodata is obtained by doing an elementwise multiplication - # of the multiplicative uncertainties with the above random matrix. We then - # add unity to all elements and take the product across columns. This vector - # must then be elementwise multiplied by central values + correlated deviates. - pseudodata = np.prod(1 + random_matrix * mult_sys.to_numpy(), axis=1) * pseudodata - # Check positivity of the pseudodata. Note that if the dataset is an asymmetry - # then it is allowed to be negative. 
- if all(pseudodata > 0) or ("ASY" in commondata.process_type): + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) + + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): break - yield pseudodata + yield all_pseudodata @check_darwin_single_process From 5b33d2766bf5e06dcf0ce4f4ae88760a7544e954 Mon Sep 17 00:00:00 2001 From: siranipour Date: Tue, 8 Dec 2020 13:09:15 +0000 Subject: [PATCH 20/28] Adding fillna(0) to special additive errors --- validphys2/src/validphys/pseudodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 2e7d5cbee4..da9311e00b 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -225,8 +225,8 @@ def make_replica(list_of_commondata, seed=None): check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) # if we sort here (which sorts columns), then permuting datasets doesn't change the result - special_add_errors = pd.concat(special_add, axis=0, sort=True).to_numpy() # non-overlapping systematics are set to NaN by concat, fill with 0 instead. 
+ special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() From cfdb94619ed92f0edaab849befbebff9e5bb9b19 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 9 Dec 2020 15:58:08 +0000 Subject: [PATCH 21/28] Changing covmats.py to not use the use_mult_errors flag --- validphys2/src/validphys/covmats.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 85ccc91ee4..02b8268b9b 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -101,7 +101,7 @@ def covmat_from_systematics(commondata): commondata.stat_errors.to_numpy(), commondata.systematic_errors()) -def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): +def datasets_covmat_from_systematics(list_of_commondata): """Given a list containing :py:class:`validphys.coredata.CommonData` s, construct the full covariance matrix. @@ -116,10 +116,6 @@ def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): list_of_commondata : list[validphys.coredata.CommonData] list of CommonData objects. - use_mult_errors: bool - Whether or not to include multiplicative systematic errors in the computation of - the covariance matrix. This is largely for use in :py:func:`validphys.pseudodata.make_replica`. 
- Returns ------- cov_mat : np.array @@ -144,10 +140,7 @@ def datasets_covmat_from_systematics(list_of_commondata, use_mult_errors=True): block_diags = [] for cd in list_of_commondata: - if use_mult_errors: - errors = cd.systematic_errors() - else: - errors = cd.additive_errors + errors = cd.systematic_errors() # separate out the special uncertainties which can be correlated across # datasets is_intra_dataset_error = errors.columns.isin(INTRA_DATASET_SYS_NAME) From 0129724f5bb1525bcc987c27abe1fbcaa156e211 Mon Sep 17 00:00:00 2001 From: wilsonmr <33907451+wilsonmr@users.noreply.github.com> Date: Wed, 16 Dec 2020 10:01:31 +0000 Subject: [PATCH 22/28] Apply suggestions from code review - comments to split up code Co-authored-by: Rosalyn Pearson <33020850+RosalynLP@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index da9311e00b..eb450f0fc9 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,7 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() @@ -197,9 +198,11 @@ def make_replica(list_of_commondata, seed=None): # append the partially shifted pseudodata pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use special_add.append( add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ mult_errors = cd.multiplicative_errors mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction @@ -214,6 +217,7 @@ def make_replica(list_of_commondata, seed=None): mult_shifts.append(mult_shift) + # store 
the multiplicative errors with correlations between datasets for later use special_mult.append( mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) From 6772fb69bcd3dfb8b7cef0669c4923ded6f757a9 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 16 Dec 2020 10:17:23 +0000 Subject: [PATCH 23/28] include stat errors --- validphys2/src/validphys/pseudodata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index eb450f0fc9..c74529b48f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -186,6 +186,9 @@ def make_replica(list_of_commondata, seed=None): for cd in list_of_commondata: pseudodata = cd.central_values.to_numpy() + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() From b662abbb5fdfde851e291cdbce3ae0fff09a353f Mon Sep 17 00:00:00 2001 From: siranipour <43517072+siranipour@users.noreply.github.com> Date: Fri, 15 Jan 2021 14:06:45 +0000 Subject: [PATCH 24/28] Removing if seed is None Numpy has this as default behaviour Co-authored-by: wilsonmr <33907451+wilsonmr@users.noreply.github.com> --- validphys2/src/validphys/pseudodata.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index c74529b48f..0a1b70e064 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -163,11 +163,6 @@ def make_replica(list_of_commondata, seed=None): .. todo:: Replace the inhouse random number generation with a numpy equivalent. .. todo:: Allow for correlations between datasets within an experiment. 
""" - # If seed is None (i.e not explicitly provided in the function call) - # then set it randomly - if seed is None: - seed = np.random.randint(1000) - # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From a6423de477a0f55cb7c0bdb6d6ca24a19d8e78c0 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:18:24 +0000 Subject: [PATCH 25/28] Changing generator to function --- validphys2/src/validphys/pseudodata.py | 138 ++++++++++++------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 0a1b70e064..b65024a5b7 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -166,84 +166,80 @@ def make_replica(list_of_commondata, seed=None): # Seed the numpy RNG with the seed. np.random.seed(seed=seed) - # We make this generator infinite in the sense that it shall never - # raise a StopIteration exception. Thus we can continually request - # pseudodata from it and it will remember the RNG state. 
+ # The inner while True loop is for ensuring a positive definite + # pseudodata replica while True: - # The inner while True loop is for ensuring a positive definite - # pseudodata replica - while True: - pseudodatas = [] - special_add = [] - special_mult = [] - mult_shifts = [] - check_positive_masks = [] - for cd in list_of_commondata: - pseudodata = cd.central_values.to_numpy() - - # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) - - # ~~~ ADDITIVE ERRORS ~~~ - add_errors = cd.additive_errors - add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) - - # correlated within dataset - add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) - - # append the partially shifted pseudodata - pseudodatas.append(pseudodata) - # store the additive errors with correlations between datasets for later use - special_add.append( - add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) - # ~~~ MULTIPLICATIVE ERRORS ~~~ - mult_errors = cd.multiplicative_errors - mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() - # convert to from percent to fraction - mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 - ).prod(axis=1) - - mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() - mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 - ).prod(axis=1) - - mult_shifts.append(mult_shift) - - # store the multiplicative errors with correlations between datasets for later use - special_mult.append( - mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] - ) + pseudodatas = [] + special_add = [] + special_mult = [] + mult_shifts = [] + 
check_positive_masks = [] + for cd in list_of_commondata: + pseudodata = cd.central_values.to_numpy() + + # add contribution from statistical uncertainty + pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + + # ~~~ ADDITIVE ERRORS ~~~ + add_errors = cd.additive_errors + add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() + + pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + + # correlated within dataset + add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() + pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + + # append the partially shifted pseudodata + pseudodatas.append(pseudodata) + # store the additive errors with correlations between datasets for later use + special_add.append( + add_errors.loc[:, ~add_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) + # ~~~ MULTIPLICATIVE ERRORS ~~~ + mult_errors = cd.multiplicative_errors + mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() + # convert to from percent to fraction + mult_shift = ( + 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + ).prod(axis=1) + + mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() + mult_shift *= ( + 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + ).prod(axis=1) + + mult_shifts.append(mult_shift) + + # store the multiplicative errors with correlations between datasets for later use + special_mult.append( + mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] + ) - # mask out the data we want to check are all positive - if "ASY" in cd.commondataproc: - check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) - else: - check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) + # mask out the data we want to check are all positive + if "ASY" in cd.commondataproc: + 
check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) + else: + check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - # if we sort here (which sorts columns), then permuting datasets doesn't change the result - # non-overlapping systematics are set to NaN by concat, fill with 0 instead. - special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() - special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() + # if we sort here (which sorts columns), then permuting datasets doesn't change the result + # non-overlapping systematics are set to NaN by concat, fill with 0 instead. + special_add_errors = pd.concat(special_add, axis=0, sort=True).fillna(0).to_numpy() + special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() - all_pseudodata = ( - np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) - ) * ( - np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) - ) + all_pseudodata = ( + np.concatenate(pseudodatas, axis=0) + + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + ) * ( + np.concatenate(mult_shifts, axis=0) + * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + ) - if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): - break + if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): + break - yield all_pseudodata + return all_pseudodata @check_darwin_single_process From 3419bc66e2185e44b6cc71deff957c934845c4a9 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 14:27:56 +0000 Subject: [PATCH 26/28] Updating doc string --- validphys2/src/validphys/pseudodata.py | 58 ++++++++++---------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py 
b/validphys2/src/validphys/pseudodata.py index b65024a5b7..dcd47203b3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -111,57 +111,43 @@ def read_fit_pseudodata(fitcontext, context_index): def make_replica(list_of_commondata, seed=None): - # TODO: this is the old docstring: update it - """Generator that takes in a :py:class:`validphys.coredata.CommonData` or - :py:class:`validphys.core.CommonDataSpec` object and yields pseudodata replicas - of the data central value. This generator is infinite in the sense that it will - never raise a StopIteration exception - - The square root (obtained by the Cholesky decomposition) of the covariance - matrix is used as the sampling matrix with the covariance matrix being obtained using - :py:func:`validphys.covmats.covmat_from_systematics`. In the internal code, the call - to this function has the boolean parameter ``use_mult_errors`` set to ``False`` to - ignore multiplicative uncertainties. Gaussian sampling is performed using the - additive part of the covariance matrix. An additional gaussian sampling for the - renormalization of the data having multiplicative uncertainties happens later. - All the contributions to the covariance matrix coming from theory errors should be - accounted for in the additive part. + """Function that takes in a list of :py:class:`validphys.coredata.CommonData` + objects and returns pseudodata replicas of the data central value accounting for + possible correlations between systematic uncertainties. + + The function loops until positive pseudodata is generated for all + non-asymmetry datasets. In the case of an asymmetry dataset negative values are + permitted so the loop block executes only once. Parameters --------- - commondata: :py:class:`validphys.coredata.CommonData`, :py:class:`validphys.core.CommonDataSpec` - CommonData which stores information about systematic errors, - their treatment and description.
+ list_of_commondata: iterable[:py:class:`validphys.coredata.CommonData`] + Iterable of CommonData objects which stores information about systematic errors, + their treatment and description, for each dataset. seed: int, None - Seed used to initialise the random number generator. If ``None`` then a random seed is allocated - using ``random.randint(1, 1000)``. This default behaviour is selected to avoid introducing - inadvertent correlations. + Seed used to initialise the numpy random number generator. If ``None`` then a random seed is + allocated using the default numpy behaviour. Returns ------- pseudodata: np.array - Numpy array which is N_dat (where N_dat is the number of data points after cuts) + Numpy array which is N_dat (where N_dat is the combined number of data points after cuts) containing monte carlo samples of data centered around the data central value. - Example ------- + >>> from validphys.commondataparser import load_commondata >>> from validphys.loader import Loader - >>> l = Loader() >>> from validphys.pseudodata import make_replica - >>> cd = l.check_commondata("NMC") - >>> data_generator = make_replica(cd) - >>> next(data_generator) - - Random Generator allocated: ranlux - array([0.24013561, 0.24060029, 0.26382826, 0.27698772, 0.28603631, - 0.28491346, 0.30983694, 0.31394779, 0.2987047 , 0.31680287, - 0.33169788, 0.29497604, 0.30490075, 0.32969979, 0.34781687, - 0.34964414, 0.30162117, 0.32235222, 0.32743923, 0.33293424, - - - .. todo:: Replace the inhouse random number generation with a numpy equivalent. - .. todo:: Allow for correlations between datasets within an experiment. 
+ >>> l = Loader() + >>> cd1 = l.check_commondata("NMC") + >>> cd2 = l.check_commondata("NMCPD") + >>> lds = map(load_commondata, (cd1, cd2)) + >>> make_replica(lds) + array([0.25721162, 0.2709698 , 0.27525357, 0.28903442, 0.3114298 , + 0.3005844 , 0.3184538 , 0.31094522, 0.30750703, 0.32673155, + 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. np.random.seed(seed=seed) From 14abf628273121073e7eeef747c2db66147d6a78 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 15 Jan 2021 16:16:34 +0000 Subject: [PATCH 27/28] Using new numpy.random interface --- validphys2/src/validphys/pseudodata.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index dcd47203b3..e5ec8a30f6 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -7,7 +7,6 @@ import multiprocessing as mp import os import pathlib -import random import numpy as np import pandas as pd @@ -150,7 +149,7 @@ def make_replica(list_of_commondata, seed=None): 0.34843355, 0.34730928, 0.3090914 , 0.32825111, 0.3485292 , """ # Seed the numpy RNG with the seed. 
- np.random.seed(seed=seed) + rng = np.random.default_rng(seed=seed) # The inner while True loop is for ensuring a positive definite # pseudodata replica @@ -164,17 +163,17 @@ def make_replica(list_of_commondata, seed=None): pseudodata = cd.central_values.to_numpy() # add contribution from statistical uncertainty - pseudodata += (cd.stat_errors.to_numpy() * np.random.randn(cd.ndata)) + pseudodata += (cd.stat_errors.to_numpy() * rng.normal(size=cd.ndata)) # ~~~ ADDITIVE ERRORS ~~~ add_errors = cd.additive_errors add_uncorr_errors = add_errors.loc[:, add_errors.columns=="UNCORR"].to_numpy() - pseudodata += (add_uncorr_errors * np.random.randn(*add_uncorr_errors.shape)).sum(axis=1) + pseudodata += (add_uncorr_errors * rng.normal(size=add_uncorr_errors.shape)).sum(axis=1) # correlated within dataset add_corr_errors = add_errors.loc[:, add_errors.columns == "CORR"].to_numpy() - pseudodata += add_corr_errors @ np.random.randn(add_corr_errors.shape[1]) + pseudodata += add_corr_errors @ rng.normal(size=add_corr_errors.shape[1]) # append the partially shifted pseudodata pseudodatas.append(pseudodata) @@ -187,12 +186,12 @@ def make_replica(list_of_commondata, seed=None): mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() # convert to from percent to fraction mult_shift = ( - 1 + mult_uncorr_errors * np.random.randn(*mult_uncorr_errors.shape) / 100 + 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 ).prod(axis=1) mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() mult_shift *= ( - 1 + mult_corr_errors * np.random.randn(1, mult_corr_errors.shape[1]) / 100 + 1 + mult_corr_errors * rng.normal(size=(1, mult_corr_errors.shape[1])) / 100 ).prod(axis=1) mult_shifts.append(mult_shift) @@ -216,10 +215,10 @@ def make_replica(list_of_commondata, seed=None): all_pseudodata = ( np.concatenate(pseudodatas, axis=0) - + special_add_errors @ np.random.randn(special_add_errors.shape[1]) + + special_add_errors 
@ rng.normal(size=special_add_errors.shape[1]) ) * ( np.concatenate(mult_shifts, axis=0) - * (1 + special_mult_errors * np.random.randn(1, special_mult_errors.shape[1]) / 100).prod(axis=1) + * (1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100).prod(axis=1) ) if np.all(all_pseudodata[np.concatenate(check_positive_masks, axis=0)] >= 0): From 9282cc1941ed3f5e78a3644f733ce2e4d6b9990e Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 21 Jan 2021 17:49:26 +0000 Subject: [PATCH 28/28] Working on reconstructing pseudodata --- validphys2/src/validphys/pseudodata.py | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index e5ec8a30f6..fc4cc81324 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -30,6 +30,66 @@ context_index = collect("groups_index", ("fitcontext",)) + +def reconstruct_replica(fit, list_of_commondata, replica: int): + """A function that reconstructs the pseudodata seen by replica number ``replica`` + + #TODO: add list_of_commondata + Parameters + ---------- + fit: validphys.core.fit + The input fit + replica: int + The replica number for which the pseudodata is reconstructed + + Returns + ------- + #XXX: maybe this will be a pd.Series + pseudodata: np.array + The pseudodata in array format + + Raises + ------ + KeyError: + If one of ``trvlseed``, ``nnseed``, or ``mcseed`` is not found under the ``fitting`` + namespace + + Example: + >>> from validphys.api import API + >>> API.reconstruct_replica(fit="NNPDF40_nnlo_as_0118", replica=0) + #TODO: finish this example + """ + log.warning(f"Attempt to reconstruct pseudodata for MC replica {replica} " + f"of {fit}. Be advised that this functionality is not available " + "for all fits, but only those fitted after N3FIT switched to the " + "Python MakeReplica function."
+ ) + + runcard = fit.as_input() + + try: + fit_ns = runcard["fitting"] + trvlseed = fit_ns["trvlseed"] + nnseed = fit_ns["nnseed"] + mcseed = fit_ns["mcseed"] + except KeyError as e: + raise KeyError(f"The following keys were not found in the fitting namespace {str(e)}, " + "please ensure the input fit is a valid N3FIT fit." + ) from e + + seeds = initialize_seeds(replica, trvlseed, nnseed, mcseed, genrep=True) + + pseudodata = make_replica(list_of_commondata, seed=mcseed) + + # TODO: change the need for datasets and exp_name + trmasks, vlmasks = reader.make_tr_val_mask(datasets=None, exp_name=None, seed=trvlseed) + + training = pseudodata[:, trmasks] + validation = pseudodata[:, vlmasks] + + return training, validation + + @check_cuts_fromfit def read_fit_pseudodata(fitcontext, context_index): """Generator to handle the reading of training and validation splits for a fit that has been