Polarised Jet commondata implementation #2035
Merged
82 commits
9d52347  PHENIX implementation, eta still needs to be done  (toonhasenack)
90f5342  pre-commit change  (toonhasenack)
577512c  implemented changes for PHENIX  (toonhasenack)
eb86dbd  Merge branch 'master' into Jet_commondata  (giacomomagni)
49a3755  STAR2012 and STAR2015 addition  (toonhasenack)
aace47c  Name change of STARyear to STAR_year convention  (toonhasenack)
994638a  propagate previous namechange  (toonhasenack)
12d2ac9  implemented correlations for STAR_2015  (toonhasenack)
ebcc5ca  init 510 correlated data  (giacomomagni)
9c76548  move symm error to utils  (giacomomagni)
516b85f  Merge branch 'master' into Jet_commondata  (giacomomagni)
a5888e1  correct label  (giacomomagni)
34e2c8b  correct label  (giacomomagni)
79bb21f  fix art systematics  (giacomomagni)
250ea79  fix metadata  (giacomomagni)
fb54e16  make test passing  (giacomomagni)
9d3f3f4  Giacomo's suggestions with 1 to go  (toonhasenack)
cee14dd  init star 2015 parser  (giacomomagni)
106d475  implement 2015 dijet and jet and fix tests  (giacomomagni)
42d4135  STAR_2009_2JET implementation with only correlations to go  (toonhasenack)
73d93f7  fix 2009 dijet and implement 2012  (giacomomagni)
875eede  small correction in metadata  (toonhasenack)
c6cc214  Merge branch 'Jet_commondata' of https://github.com/NNPDF/nnpdf into …  (toonhasenack)
dfa92ba  full implementation of STAR_2009 including correlations  (toonhasenack)
6cc4020  change back filter import  (toonhasenack)
8687f8a  Giacomo's suggestions  (toonhasenack)
7241f99  minor fixes on syntax  (giacomomagni)
65a2728  fix kinematic label  (giacomomagni)
6b15c86  fixed STAR2015 polarised error  (toonhasenack)
8218129  apply same fix also to dijet  (giacomomagni)
637e7d8  STAR_2012 correct correlations  (toonhasenack)
72e4969  correct wrong year label  (giacomomagni)
45a5c8d  Merge branch 'Jet_commondata' of https://github.com/NNPDF/nnpdf into …  (giacomomagni)
1b9a9cf  fix metadata  (giacomomagni)
de2a065  init restore older filters  (giacomomagni)
b262ebb  resotre previous atlas utils  (giacomomagni)
77ca9bb  resotre previous atlas utils  (giacomomagni)
0ab33c1  more cleaning  (giacomomagni)
ede56cf  Adding pol unc to 2013 data  (giacomomagni)
030f63e  various fixes in 2009 1 JET data  (giacomomagni)
0840309  make notation more uniform  (giacomomagni)
63b1760  Add beam pol error to phenix  (giacomomagni)
4f5460e  remove minus sign from pol unc  (giacomomagni)
a565ab8  adding pol unc to 2005 and 2006 data  (giacomomagni)
fd9c6ed  uncorrelate PHENIX from STAR_2005  (toonhasenack)
f696309  revert correlations in phenix and star 2005  (giacomomagni)
879fd26  init collapsing 2012 dijets  (giacomomagni)
1e39f34  init collapsing 2013 dijets  (giacomomagni)
46ecd2e  init collapsing 2015 dijets  (giacomomagni)
e5ed528  addind missing refs  (giacomomagni)
31ab9c2  some fixes  (giacomomagni)
9347f45  Adding lumi shift to 2013 data  (giacomomagni)
fe6afde  collapse 2009 dijets  (giacomomagni)
023a95f  fix lumi unc and stat in 2009 dijets  (giacomomagni)
e3805a7  move filters closer to raw data  (giacomomagni)
bbd5112  correct unc order in 2015 data  (giacomomagni)
6e43675  Merge branch 'master' into Jet_commondata  (giacomomagni)
4fc04c4  update 1jet fktables names  (giacomomagni)
49e15e7  split 1jet 2009 and 2015 into 2 observables  (giacomomagni)
d7f9507  fix fktables names  (giacomomagni)
aa2c2a7  fix phenix name  (giacomomagni)
b5ff1d5  correct PHENIX source table  (giacomomagni)
0c91013  add review comments  (giacomomagni)
e983e99  Merge branch 'master' into Jet_commondata  (Radonirinaunimi)
c3877df  account for pol/unpol distinction in dijets  (toonhasenack)
e0cbe64  Merge branch 'master' into Jet_commondata  (giacomomagni)
80b9fbb  510-->200  (toonhasenack)
012347f  Merge branch 'Jet_commondata' of https://github.com/NNPDF/nnpdf into …  (toonhasenack)
a516d7a  implement E. Aschenauer suggestion  (giacomomagni)
a2941ae  implement 2004 1JET data  (giacomomagni)
681416b  Merge branch 'master' into Jet_commondata  (giacomomagni)
3059e94  Merge branch 'master' into Jet_commondata  (giacomomagni)
cd57101  Merge branch 'master' into Jet_commondata  (giacomomagni)
5348827  Merge branch 'master' into Jet_commondata  (giacomomagni)
9b90f15  init dijet eta implementation  (giacomomagni)
45b9f90  add eta variables to star data  (giacomomagni)
6cfdbf3  varius typos fixes  (giacomomagni)
03bdb01  some fixes to metadata  (giacomomagni)
c2d9401  remove sqrts from dijets  (giacomomagni)
2c1feb4  minor fixes  (giacomomagni)
a7d8d54  minor fix on plot labels  (giacomomagni)
9925e4d  another attempt on plot labels, now working  (giacomomagni)
New file, 90 additions:
```python
import numpy as np
from numpy.linalg import eig


def upper_triangular_to_symmetric(ut, dim):
    """Build a symmetric matrix from the upper diagonal"""
    corr = np.zeros((dim, dim))
    last = dim
    first = 0
    for i in range(dim):
        corr[i, i:] = ut[first:last]
        last += dim - i - 1
        first += dim - i
    return corr


def compute_covmat(corrmat: np.ndarray, unc: np.ndarray, ndata: int) -> list:
    """Compute the covariance matrix with the artificial stat uncertainties."""
    # multiply by stat err
    cov_mat = np.einsum("i,ij,j->ij", unc, corrmat, unc)
    return covmat_to_artunc(ndata, cov_mat.flatten().tolist())


def covmat_to_artunc(ndata, covmat_list, no_of_norm_mat=0):
    r"""Convert the covariance matrix to a matrix of
    artificial uncertainties.

    NOTE: This function has been taken from validphys.newcommondata_utils.
    If those utils get merged in the future, we can replace this.

    Parameters
    ----------
    ndata : integer
        Number of data points
    covmat_list : list
        A one dimensional list which contains the elements of
        the covariance matrix row by row. Since experimental
        datasets provide these matrices in a list form, this
        simplifies the implementation for the user.
    no_of_norm_mat : int
        Normalized covariance matrices may have an eigenvalue
        of 0 due to the last data point not being linearly
        independent. To allow for this, the user should input
        the number of normalized matrices that are being treated
        in an instance. For example, if a single covariance matrix
        of a normalized distribution is being processed, the input
        would be 1. If a covariance matrix pertains to
        3 normalized datasets (i.e. cross covmat for 3
        distributions), the input would be 3. The default value is
        0 for when the covariance matrix pertains to an absolute
        distribution.

    Returns
    -------
    artunc : list
        A two dimensional matrix (given as a list of lists)
        which contains artificial uncertainties to be added
        to the commondata. i^th row (or list) contains the
        artificial uncertainties of the i^th data point.
    """
    epsilon = -0.0000000001
    neg_eval_count = 0
    psd_check = True
    covmat = np.zeros((ndata, ndata))
    artunc = np.zeros((ndata, ndata))
    for i in range(len(covmat_list)):
        a = i // ndata
        b = i % ndata
        covmat[a][b] = covmat_list[i]
    eigval, eigvec = eig(covmat)
    for j in range(len(eigval)):
        if eigval[j] < epsilon:
            psd_check = False
        elif eigval[j] > epsilon and eigval[j] <= 0:
            neg_eval_count = neg_eval_count + 1
            if neg_eval_count == (no_of_norm_mat + 1):
                psd_check = False
        elif eigval[j] > 0:
            continue
    if not psd_check:
        raise ValueError("The covariance matrix is not positive-semidefinite")
    for i in range(ndata):
        for j in range(ndata):
            if eigval[j] < 0:
                continue
            artunc[i][j] = eigvec[i][j] * np.sqrt(eigval[j])
    return artunc.tolist()
```
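To illustrate what `compute_covmat` and `covmat_to_artunc` do together, here is a minimal standalone sketch of the same construction: contract a correlation matrix with the per-point uncertainties, eigendecompose the result, and scale the eigenvectors into "artificial" uncertainties whose outer product reconstructs the covariance. The matrix and errors below are illustrative, not values from the PR, and `eigh` is used here (valid for a symmetric matrix) where the PR uses `eig`.

```python
import numpy as np

# Hypothetical 3-point correlation matrix and statistical errors
corr = np.array([
    [1.0, 0.5, 0.2],
    [0.5, 1.0, 0.3],
    [0.2, 0.3, 1.0],
])
unc = np.array([0.1, 0.2, 0.15])

# Same contraction as compute_covmat: C_ij = unc_i * corr_ij * unc_j
cov = np.einsum("i,ij,j->ij", unc, corr, unc)

# Artificial uncertainties, as in covmat_to_artunc: column j of the
# eigenvector matrix scaled by sqrt(eigval_j); negative eigenvalues
# (numerical noise) are clipped to zero.
eigval, eigvec = np.linalg.eigh(cov)
artunc = eigvec * np.sqrt(np.clip(eigval, 0.0, None))

# The defining property: A @ A.T reproduces the covariance matrix,
# so the columns of A can be stored as fully correlated uncertainties.
assert np.allclose(artunc @ artunc.T, cov)
```

This is why the commondata files can carry a plain covariance matrix as `ndata` extra correlated uncertainty columns.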
New file, 27 additions:

```python
import numpy as np


def symmetrize_errors(delta_plus, delta_minus):
    r"""Compute the symmetrized uncertainty and the shift in data point.

    Parameters
    ----------
    delta_plus : float
        The top/plus uncertainty with sign
    delta_minus : float
        The bottom/minus uncertainty with sign

    Returns
    -------
    se_delta : float
        The value to be added to the data point
    se_sigma : float
        The symmetrized uncertainty to be used in commondata
    """
    semi_diff = (delta_plus + delta_minus) / 2
    average = (delta_plus - delta_minus) / 2
    se_delta = semi_diff
    se_sigma = np.sqrt(average * average + 2 * semi_diff * semi_diff)
    return se_delta, se_sigma
```
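A quick worked example of the symmetrization formula, with the function reproduced inline so the sketch is self-contained (the input values are illustrative): an asymmetric error of +0.5/-0.3 yields a shift of 0.1 (the half-sum) and a symmetric width of sqrt(0.4² + 2·0.1²) = sqrt(0.18).

```python
import numpy as np

def symmetrize(delta_plus, delta_minus):
    # Same formula as symmetrize_errors above, reproduced for a standalone check
    semi_diff = (delta_plus + delta_minus) / 2   # shift applied to the data point
    average = (delta_plus - delta_minus) / 2     # mean half-width of the interval
    return semi_diff, np.sqrt(average * average + 2 * semi_diff * semi_diff)

# +0.5 (upward) and -0.3 (downward), both with sign as the docstring requires
shift, sigma = symmetrize(0.5, -0.3)
# shift == 0.1, sigma == sqrt(0.18) ≈ 0.4243
```

Note the symmetric width is slightly larger than the plain average of the two halves, since the asymmetry term 2·semi_diff² enters the quadrature.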
nnpdf_data/nnpdf_data/new_commondata/PHENIX_1JET_200GEV/data.yaml (7 additions):

```yaml
data_central:
- -0.0014
- -0.0005
- 0.0058
- 0.0034
- 0.0077
- -0.0181
```
nnpdf_data/nnpdf_data/new_commondata/PHENIX_1JET_200GEV/filter.py (101 additions):

```python
import pandas as pd
import yaml

POL_UNC = 0.094


def read_data():
    df = pd.DataFrame()

    with open("rawdata/Table4.yaml", "r") as file:
        data = yaml.safe_load(file)

    pTbsub = data["independent_variables"][0]["values"]
    pTsub = data["dependent_variables"][0]["values"]
    ALLsub = data["dependent_variables"][1]["values"]

    for i in range(len(ALLsub)):
        df = pd.concat(
            [
                df,
                pd.DataFrame(
                    {
                        "pT": [pTsub[i]["value"]],
                        "pTmin": [pTbsub[i]["low"]],
                        "pTmax": [pTbsub[i]["high"]],
                        "eta": [0.0],
                        "eta_min": [-0.35],
                        "eta_max": [0.35],
                        "sqrts": [200],
                        "ALL": [ALLsub[i]["value"]],
                        "stat": [ALLsub[i]["errors"][0]["symerror"]],
                    }
                ),
            ],
            ignore_index=True,
        )

    df["pol"] = POL_UNC * abs(df["ALL"])
    return df


def write_data(df):
    data_central = []
    for i in range(len(df["ALL"])):
        data_central.append(float(df.loc[i, "ALL"]))

    data_central_yaml = {"data_central": data_central}
    with open("data.yaml", "w") as file:
        yaml.dump(data_central_yaml, file, sort_keys=False)

    # Write kin file
    kin = []
    for i in range(len(df["ALL"])):
        kin_value = {
            "pT": {
                "min": float(df.loc[i, "pTmin"]),
                "mid": float(df.loc[i, "pT"]),
                "max": float(df.loc[i, "pTmax"]),
            },
            "sqrts": {"min": None, "mid": float(df.loc[i, "sqrts"]), "max": None},
            "eta": {
                "min": float(df.loc[i, "eta_min"]),
                "mid": float(df.loc[i, "eta"]),
                "max": float(df.loc[i, "eta_max"]),
            },
        }
        kin.append(kin_value)

    kinematics_yaml = {"bins": kin}

    with open("kinematics.yaml", "w") as file:
        yaml.dump(kinematics_yaml, file, sort_keys=False)

    # Write unc file
    error = []
    for i in range(len(df)):
        e = {"stat": float(df.loc[i, "stat"]), "pol": float(df.loc[i, "pol"])}
        error.append(e)

    error_definition = {
        "stat": {
            "description": "statistical uncertainty",
            "treatment": "ADD",
            "type": "UNCORR",
        },
        "pol": {
            "description": "beam polarization uncertainty",
            "treatment": "MULT",
            "type": "RHIC2005POL",
        },
    }

    uncertainties_yaml = {"definitions": error_definition, "bins": error}

    with open("uncertainties.yaml", "w") as file:
        yaml.dump(uncertainties_yaml, file, sort_keys=False)


if __name__ == "__main__":
    df = read_data()
    write_data(df)
```
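The filter's per-bin uncertainty logic can be exercised without the raw HEPData file by feeding it an in-memory record of the same shape. The sketch below mimics one bin of `rawdata/Table4.yaml` with made-up numbers (not the real PHENIX values) and reproduces the two uncertainty entries the filter would write: the symmetric statistical error taken directly from the record, and the beam-polarization error as 9.4% of |A_LL|.

```python
# Hypothetical in-memory version of one bin of rawdata/Table4.yaml;
# the numbers are illustrative, not the published PHENIX data.
data = {
    "independent_variables": [{"values": [{"low": 2.0, "high": 3.0}]}],
    "dependent_variables": [
        {"values": [{"value": 2.5}]},                                        # jet pT
        {"values": [{"value": -0.0014, "errors": [{"symerror": 0.0042}]}]},  # A_LL
    ],
}

POL_UNC = 0.094  # relative beam-polarization uncertainty, as in the filter

ALL = data["dependent_variables"][1]["values"][0]["value"]
stat = data["dependent_variables"][1]["values"][0]["errors"][0]["symerror"]

# One entry of the "bins" list written to uncertainties.yaml
bin0 = {"stat": float(stat), "pol": float(POL_UNC * abs(ALL))}
```

Because `pol` scales with the measured asymmetry, it is declared `MULT` with the correlated type `RHIC2005POL`, while `stat` is `ADD` and `UNCORR`.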