From 4fbe4894610285ad10d83ceb5c3b9792509525f9 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Tue, 5 May 2020 10:49:26 +0100 Subject: [PATCH 01/58] copying code from PR476 for data loading --- validphys2/src/validphys/core.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 7971a27091..74bdbebbda 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -232,7 +232,25 @@ def grid_values_index(self): else: raise RuntimeError("Unknown error type") - +def load_dataset(datafile): + """Reads commondata file for dataset_name and returns a panda DataFrame with: + entry process kin1 kin2 kin3 data stat \ + sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N + """ + # read raw commondata file + table = pd.read_csv(datafile, sep=r'\s+', skiprows=1, header=None) + + # remove NaNs + # TODO: replace commondata files with bad formatting + table.dropna(axis='columns', inplace=True) + + # build header + header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] + for i in range((table.shape[1]-len(header))//2): + header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] + table.columns = header + table.set_index('entry', inplace=True) + return table kinlabels_latex = CommonData.kinLabel_latex.asdict() _kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True) From d9b632bf817a8d35de952ba50c33d2e9ae5c002a Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Tue, 5 May 2020 10:51:55 +0100 Subject: [PATCH 02/58] altering load to use new function --- validphys2/src/validphys/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 74bdbebbda..37a6c11e63 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -327,9 +327,9 @@ def __iter__(self): return iter((self.datafile, self.sysfile, self.plotfiles)) @functools.lru_cache() - def load(self)->CommonData: + def load(self): #TODO: Use better path handling in python 3.6 - return CommonData.ReadFile(str(self.datafile), str(self.sysfile)) + return load_dataset(self.datafile) @property def plot_kinlabels(self): From 6c0351c504d8355dee76731f61f73bada70ed1e7 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Tue, 5 May 2020 11:10:24 +0100 Subject: [PATCH 03/58] Changing dataset -> name in config --- validphys2/src/validphys/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 59c2c9b7e7..8e0d72c129 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -241,14 +241,14 @@ def produce_fitpdfandbasis(self, fit): def parse_dataset_input(self, dataset:Mapping): """The mapping that corresponds to the dataset specifications in the fit files""" - known_keys = {'dataset', 'sys', 'cfac', 'frac', 'weight'} + known_keys = {'name', 'sys', 'cfac', 'frac', 'weight'} try: - name = dataset['dataset'] + name = dataset['name'] if not isinstance(name, str): - raise ConfigError(f"'dataset' must be a string, not {type(name)}") + raise ConfigError(f"'name' must be a string, not {type(name)}") except KeyError: - raise ConfigError("'dataset' must be a mapping with " - "'dataset' and 'sysnum'") + raise ConfigError("'name' must be a mapping with " + "'name' and 'sysnum'") sysnum = dataset.get('sys') From a840f2c48970901e8ade389af0320fdc0859ffcb Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Tue, 5 May 2020 12:00:15 +0100 Subject: [PATCH 04/58] Importing pandas --- validphys2/src/validphys/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 37a6c11e63..c48fc40351 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -15,6 +15,7 @@ import logging import numpy as np +import pandas as pd from reportengine import namespaces from reportengine.baseexceptions import AsInputError From 28336642557f19bf331da48c6ab99afd450b1f51 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 12:03:16 +0100 Subject: [PATCH 05/58] data container for commondata --- validphys2/coredata.py | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 validphys2/coredata.py diff --git a/validphys2/coredata.py b/validphys2/coredata.py new file mode 100644 index 0000000000..6e1cad6272 --- /dev/null +++ b/validphys2/coredata.py @@ -0,0 +1,51 @@ +""" +Data containers backed by Python managed memory (Numpy arrays and Pandas +dataframes). This module is intended to substitute large parts of the C++ +wrappers. +""" +import dataclasses +import numpy as np +import pandas as pd + +@dataclasses.dataclass(eq=False) +class CommonData: + """ + Data contained in Commondata files + Parameters + ---------- + setname: str + Name of the dataset + ndata: int + Number of data points + data: array of floats with length ndata + Data values + commondataproc: str + Process type, one of 21 options. + nkin: int + Number of kinematics specified + kinematics: list of str with length nkin + Kinematic variables kin1, kin2, kin3 ... + nsys: int + Number of systematics + sysid: list of str with length nsys + ID for systematic + stat: array of floats with length ndata + Statistical uncertainties on each data point + (separate ADD and MULT here?) + sys: array of floats with dimensions ndat x nsys + Systematic uncertainties on each data point + (separate ADD and MULT here?) + + + + """ + setname: str + ndata: int + data: np.array + commondataproc: str + nkin: int + kinematics: list(str) + nsys: int + sysid: list(str) + stat: np.array + sys: np.array From 08ba1d419aef49f74d4d21b57cfb6356a02dd5ff Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 14:59:07 +0100 Subject: [PATCH 06/58] moving coredata.py to correct loc --- validphys2/coredata.py | 51 ---------------------------- validphys2/src/validphys/coredata.py | 40 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 51 deletions(-) delete mode 100644 validphys2/coredata.py diff --git a/validphys2/coredata.py b/validphys2/coredata.py deleted file mode 100644 index 6e1cad6272..0000000000 --- a/validphys2/coredata.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Data containers backed by Python managed memory (Numpy arrays and Pandas -dataframes). This module is intended to substitute large parts of the C++ -wrappers. -""" -import dataclasses -import numpy as np -import pandas as pd - -@dataclasses.dataclass(eq=False) -class CommonData: - """ - Data contained in Commondata files - Parameters - ---------- - setname: str - Name of the dataset - ndata: int - Number of data points - data: array of floats with length ndata - Data values - commondataproc: str - Process type, one of 21 options. - nkin: int - Number of kinematics specified - kinematics: list of str with length nkin - Kinematic variables kin1, kin2, kin3 ... - nsys: int - Number of systematics - sysid: list of str with length nsys - ID for systematic - stat: array of floats with length ndata - Statistical uncertainties on each data point - (separate ADD and MULT here?) - sys: array of floats with dimensions ndat x nsys - Systematic uncertainties on each data point - (separate ADD and MULT here?) - - - - """ - setname: str - ndata: int - data: np.array - commondataproc: str - nkin: int - kinematics: list(str) - nsys: int - sysid: list(str) - stat: np.array - sys: np.array diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 27245a5126..a733bd4215 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -117,3 +117,43 @@ class CFactorData: description: str central_value: np.array uncertainty: np.array + +@dataclasses.dataclass(eq=False) +class CommonData: + """ + Data contained in Commondata files, relevant cuts applied. + Parameters + ---------- + setname: str + Name of the dataset + ndata: int + Number of data points + data: array of floats with length ndata + Data values + commondataproc: str + Process type, one of 21 options. + nkin: int + Number of kinematics specified + kinematics: list of str with length nkin + Kinematic variables kin1, kin2, kin3 ... + nsys: int + Number of systematics + sysid: list of str with length nsys + ID for systematic + stat: array of floats with length ndata + Statistical uncertainties on each data point + (separate ADD and MULT here?) + sys: array of floats with dimensions ndat x nsys + Systematic uncertainties on each data point + (separate ADD and MULT here?) + """ + setname: str + ndata: int + data: np.array + commondataproc: str + nkin: int + kinematics: list(str) + nsys: int + sysid: list(str) + stat: np.array + sys: np.array From e31a47d8239b20f98dbba7b5112e333131dfe629 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 15:02:14 +0100 Subject: [PATCH 07/58] adding commondata parser script --- validphys2/src/validphys/commondataparser.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 validphys2/src/validphys/commondataparser.py diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py new file mode 100644 index 0000000000..f05862a4d0 --- /dev/null +++ b/validphys2/src/validphys/commondataparser.py @@ -0,0 +1,17 @@ +""" +This module implements parsers for vommondata and systype files into useful +datastructures, contained in the :py:mod:`validphys.coredata` module, which are +not backed by C++ managed memory, and so they can be easily pickled and +interfaces with common Python libraries. The integration of these objects into +the codebase is currently work in progress, and at the moment this module +serves as a proof of concept. +""" +import io +import functools +import tarfile +import dataclasses + +import numpy as np +import pandas as pd + +from validphys.coredata import CommonData From 971c5354f93f278408787391e9689b156af73c57 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 15:09:48 +0100 Subject: [PATCH 08/58] move load_data to commondataparser --- validphys2/src/validphys/commondataparser.py | 25 +++++++++++++++++++- validphys2/src/validphys/core.py | 22 ++--------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index f05862a4d0..c4f4a89c22 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -1,5 +1,5 @@ """ -This module implements parsers for vommondata and systype files into useful +This module implements parsers for commondata and systype files into useful datastructures, contained in the :py:mod:`validphys.coredata` module, which are not backed by C++ managed memory, and so they can be easily pickled and interfaces with common Python libraries. The integration of these objects into @@ -15,3 +15,26 @@ import pandas as pd from validphys.coredata import CommonData + +class BadCommonDataError(Exception): + """Exception raised when a commondata file cannot be parsed correctly""" + +def load_dataset(datafile): + """Reads commondata file for dataset_name and returns a panda DataFrame with: + entry process kin1 kin2 kin3 data stat \ + sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N + """ + # read raw commondata file + table = pd.read_csv(datafile, sep=r'\s+', skiprows=1, header=None) + + # remove NaNs + # TODO: replace commondata files with bad formatting + table.dropna(axis='columns', inplace=True) + + # build header + header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] + for i in range((table.shape[1]-len(header))//2): + header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] + table.columns = header + table.set_index('entry', inplace=True) + return table \ No newline at end of file diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index c48fc40351..8409d8fea3 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -21,6 +21,8 @@ from reportengine.baseexceptions import AsInputError from reportengine.compat import yaml +from commondataparser import load_dataset + from NNPDF import (LHAPDFSet, CommonData, FKTable, @@ -233,26 +235,6 @@ def grid_values_index(self): else: raise RuntimeError("Unknown error type") -def load_dataset(datafile): - """Reads commondata file for dataset_name and returns a panda DataFrame with: - entry process kin1 kin2 kin3 data stat \ - sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N - """ - # read raw commondata file - table = pd.read_csv(datafile, sep=r'\s+', skiprows=1, header=None) - - # remove NaNs - # TODO: replace commondata files with bad formatting - table.dropna(axis='columns', inplace=True) - - # build header - header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - for i in range((table.shape[1]-len(header))//2): - header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] - table.columns = header - table.set_index('entry', inplace=True) - return table - kinlabels_latex = CommonData.kinLabel_latex.asdict() _kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True) From e99bcb33b4dcb63a5a10392c38640a44f01c6329 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 15:10:57 +0100 Subject: [PATCH 09/58] remove pandas import from core --- validphys2/src/validphys/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 8409d8fea3..c87b893932 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -15,7 +15,6 @@ import logging import numpy as np -import pandas as pd from reportengine import namespaces from reportengine.baseexceptions import AsInputError From 3c702fb8bbee86eb747cc7787055d1631e9d26ea Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Wed, 13 May 2020 15:14:34 +0100 Subject: [PATCH 10/58] adding CommonDataInfo class --- validphys2/src/validphys/commondataparser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index c4f4a89c22..66db782626 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -19,6 +19,14 @@ class BadCommonDataError(Exception): """Exception raised when a commondata file cannot be parsed correctly""" +@dataclasses.dataclass(frozen=True) +class CommonDataInfo: + """Class containing the basic properties of a commondata file.""" + setname: str + ndata: int + proc: str + nsys: int + def load_dataset(datafile): """Reads commondata file for dataset_name and returns a panda DataFrame with: entry process kin1 kin2 kin3 data stat \ From 3aa32e1982a401d816c15aaa454406e763eb642b Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Thu, 14 May 2020 15:05:29 +0100 Subject: [PATCH 11/58] reverting back to old behvaiour in core --- validphys2/src/validphys/core.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index c87b893932..a8c551a7e2 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -20,8 +20,6 @@ from reportengine.baseexceptions import AsInputError from reportengine.compat import yaml -from commondataparser import load_dataset - from NNPDF import (LHAPDFSet, CommonData, FKTable, @@ -309,9 +307,9 @@ def __iter__(self): return iter((self.datafile, self.sysfile, self.plotfiles)) @functools.lru_cache() - def load(self): + def load(self)->CommonData: #TODO: Use better path handling in python 3.6 - return load_dataset(self.datafile) + return CommonData.ReadFile(str(self.datafile), str(self.sysfile)) @property def plot_kinlabels(self): From dd90cae4e0784b23c76b97ad1297ba67f2696761 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 15:37:23 +0100 Subject: [PATCH 12/58] populating CommonData object --- validphys2/src/validphys/commondataparser.py | 22 ++++++++++++++++---- validphys2/src/validphys/coredata.py | 6 +++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 66db782626..44f40b409c 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -27,9 +27,10 @@ class CommonDataInfo: proc: str nsys: int -def load_dataset(datafile): - """Reads commondata file for dataset_name and returns a panda DataFrame with: - entry process kin1 kin2 kin3 data stat \ +def load_commondata(datafile): + """ + Reads commondata file for dataset_name and returns a pandas DataFrame with: + entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N """ # read raw commondata file @@ -45,4 +46,17 @@ def load_dataset(datafile): header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] table.columns = header table.set_index('entry', inplace=True) - return table \ No newline at end of file + + # Populate CommonData object + return CommonData( + setname: "Stevland Judkins" + ndata: 1 + data: np.zeros(1) + commondataproc: "DIS" + nkin: 1 + kinematics: ["x"] + nsys: 1 + sysid: "PRAWN" + stat: np.zeros(1) + sys: np.zeros((1,2)) + table = out \ No newline at end of file diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index a733bd4215..1e507eb31a 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -117,7 +117,7 @@ class CFactorData: description: str central_value: np.array uncertainty: np.array - + @dataclasses.dataclass(eq=False) class CommonData: """ @@ -152,8 +152,8 @@ class CommonData: data: np.array commondataproc: str nkin: int - kinematics: list(str) + kinematics: list nsys: int - sysid: list(str) + sysid: list stat: np.array sys: np.array From 01ff097a747224334fca7c30ca6a69419355fa5e Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 16:37:27 +0100 Subject: [PATCH 13/58] removing space in core --- validphys2/src/validphys/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index a8c551a7e2..3e818286ff 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -235,8 +235,6 @@ def grid_values_index(self): kinlabels_latex = CommonData.kinLabel_latex.asdict() _kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True) - - def get_plot_kinlabels(commondata): """Return the LaTex kinematic labels for a given Commondata""" key = commondata.process_type From 7b38cba11f28bb6b6ba97b81edeb10451a3e626c Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 16:37:58 +0100 Subject: [PATCH 14/58] separating structure into parse_commondata and load_commondata --- validphys2/src/validphys/commondataparser.py | 45 ++++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 44f40b409c..960c302035 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -27,14 +27,32 @@ class CommonDataInfo: proc: str nsys: int -def load_commondata(datafile): +def load_commondata(spec): """ + Load the data corresponding to a CommonDataSpec object. + Reads commondata file for dataset_name and returns a pandas DataFrame with: entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N """ - # read raw commondata file - table = pd.read_csv(datafile, sep=r'\s+', skiprows=1, header=None) + tabledata = parse_commondata(spec.datafile) + + return tabledata + +def parse_commondata(f): + + """Parse a commondata file into a CommonData. Raise a BadCommondATAError + if problems are encountered. + Parameters + ---------- + f : file + Open file-like object. + Returns + ------- + commondata : CommonData + An object containing the data and information from the commondata file. + """ + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) # remove NaNs # TODO: replace commondata files with bad formatting @@ -49,14 +67,13 @@ def load_commondata(datafile): # Populate CommonData object return CommonData( - setname: "Stevland Judkins" - ndata: 1 - data: np.zeros(1) - commondataproc: "DIS" - nkin: 1 - kinematics: ["x"] - nsys: 1 - sysid: "PRAWN" - stat: np.zeros(1) - sys: np.zeros((1,2)) - table = out \ No newline at end of file + setname= "Stevland Judkins", + ndata= 1, + data= np.zeros(1), + commondataproc= "DIS", + nkin= 1 , + kinematics= ["x"], + nsys= 1, + sysid= "PRAWN", + stat= np.zeros(1), + sys= np.zeros((1,2))) From 0756b3b33e906c033a1bb5c89265261adc9450c1 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 16:41:32 +0100 Subject: [PATCH 15/58] changing structure of CommonData object --- validphys2/src/validphys/coredata.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 1e507eb31a..7993fce71c 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -128,8 +128,6 @@ class CommonData: Name of the dataset ndata: int Number of data points - data: array of floats with length ndata - Data values commondataproc: str Process type, one of 21 options. nkin: int @@ -140,20 +138,14 @@ class CommonData: Number of systematics sysid: list of str with length nsys ID for systematic - stat: array of floats with length ndata - Statistical uncertainties on each data point - (separate ADD and MULT here?) - sys: array of floats with dimensions ndat x nsys - Systematic uncertainties on each data point - (separate ADD and MULT here?) + data: + Pandas dataframe containing the commondata. """ setname: str - ndata: int - data: np.array + ndata: int commondataproc: str nkin: int kinematics: list nsys: int sysid: list - stat: np.array - sys: np.array + data: pd.DataFrame From 2dac35c4dedd69e7e56e533f7075023928fe6919 Mon Sep 17 00:00:00 2001 From: RosalynLP <33020850+RosalynLP@users.noreply.github.com> Date: Fri, 15 May 2020 16:44:10 +0100 Subject: [PATCH 16/58] Update core.py --- validphys2/src/validphys/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 3e818286ff..eb1f548540 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -231,10 +231,14 @@ def grid_values_index(self): return range(0, len(self)) else: raise RuntimeError("Unknown error type") + + kinlabels_latex = CommonData.kinLabel_latex.asdict() _kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True) + + def get_plot_kinlabels(commondata): """Return the LaTex kinematic labels for a given Commondata""" key = commondata.process_type From 64a964573b559cc82cfee6aa8db83a6ec0462d96 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 17:19:10 +0100 Subject: [PATCH 17/58] searhing for setname in file name --- validphys2/src/validphys/commondataparser.py | 28 +++++++++++--------- validphys2/src/validphys/coredata.py | 2 -- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 960c302035..fa9112caf3 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -10,6 +10,7 @@ import functools import tarfile import dataclasses +import re import numpy as np import pandas as pd @@ -35,11 +36,15 @@ def load_commondata(spec): entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N """ - tabledata = parse_commondata(spec.datafile) + commondatafile = spec.datafile + + # Getting set name from commondata file name + setname = re.search('DATA_(.*).dat', str(commondatafile)).group(1) + tabledata = parse_commondata(commondatafile, setname) return tabledata -def parse_commondata(f): +def parse_commondata(f, setname): """Parse a commondata file into a CommonData. Raise a BadCommondATAError if problems are encountered. @@ -60,20 +65,17 @@ def parse_commondata(f): # build header header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - for i in range((table.shape[1]-len(header))//2): + nsys = (table.shape[1]-len(header))//2 + for i in range(nsys): header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] table.columns = header table.set_index('entry', inplace=True) # Populate CommonData object return CommonData( - setname= "Stevland Judkins", - ndata= 1, - data= np.zeros(1), - commondataproc= "DIS", - nkin= 1 , - kinematics= ["x"], - nsys= 1, - sysid= "PRAWN", - stat= np.zeros(1), - sys= np.zeros((1,2))) + setname = setname, + ndata = len(table), + commondataproc = table["process"][1], + nkin = 3 , + nsys = nsys, + data = table) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 7993fce71c..f4f6251d8a 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -145,7 +145,5 @@ class CommonData: ndata: int commondataproc: str nkin: int - kinematics: list nsys: int - sysid: list data: pd.DataFrame From e457a6cb62baaffcc2c5fe2ded1dd56cdb19bc81 Mon Sep 17 00:00:00 2001 From: Rosalyn Date: Fri, 15 May 2020 17:26:58 +0100 Subject: [PATCH 18/58] adding test for commondata parser --- .../src/validphys/tests/test_commondataparser.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 validphys2/src/validphys/tests/test_commondataparser.py diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py new file mode 100644 index 0000000000..3c7adc187c --- /dev/null +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -0,0 +1,11 @@ +import pandas as pd + +from validphys.commondataparser import load_commondata +from validphys.loader import Loader + +def test_basic_commondata_loading(): + l = Loader() + cd = l.check_commondata(setname='H1HERAF2B') + res = load_commondata(cd) + assert res.ndata == 12 + assert isinstance(res.data, pd.DataFrame) \ No newline at end of file From fa32bd7b860aed24b2e7660536f92e7eed030374 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Wed, 20 May 2020 16:24:32 +0100 Subject: [PATCH 19/58] changing name to dataset in config --- validphys2/src/validphys/commondataparser.py | 4 ++-- validphys2/src/validphys/config.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index fa9112caf3..e807373601 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -32,7 +32,7 @@ def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - Reads commondata file for dataset_name and returns a pandas DataFrame with: + Returns a CommonData instance with data arranged like entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N """ @@ -46,7 +46,7 @@ def load_commondata(spec): def parse_commondata(f, setname): - """Parse a commondata file into a CommonData. Raise a BadCommondATAError + """Parse a commondata file into a CommonData. Raise a BadCommondDataError if problems are encountered. Parameters ---------- diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 8e0d72c129..5c59486d7e 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -241,14 +241,14 @@ def produce_fitpdfandbasis(self, fit): def parse_dataset_input(self, dataset:Mapping): """The mapping that corresponds to the dataset specifications in the fit files""" - known_keys = {'name', 'sys', 'cfac', 'frac', 'weight'} + known_keys = {'dataset', 'sys', 'cfac', 'frac', 'weight'} try: - name = dataset['name'] - if not isinstance(name, str): - raise ConfigError(f"'name' must be a string, not {type(name)}") + dataset = dataset['dataset'] + if not isinstance(dataset, str): + raise ConfigError(f"'dataset' must be a string, not {type(name)}") except KeyError: - raise ConfigError("'name' must be a mapping with " - "'name' and 'sysnum'") + raise ConfigError("'dataset' must be a mapping with " + "'dataset' and 'sysnum'") sysnum = dataset.get('sys') From 1ac749257f9f8fb87b0fe1a6822655aeab220b17 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Wed, 20 May 2020 16:26:02 +0100 Subject: [PATCH 20/58] reverting to old config behaviour --- validphys2/src/validphys/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 5c59486d7e..59c2c9b7e7 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -243,8 +243,8 @@ def parse_dataset_input(self, dataset:Mapping): fit files""" known_keys = {'dataset', 'sys', 'cfac', 'frac', 'weight'} try: - dataset = dataset['dataset'] - if not isinstance(dataset, str): + name = dataset['dataset'] + if not isinstance(name, str): raise ConfigError(f"'dataset' must be a string, not {type(name)}") except KeyError: raise ConfigError("'dataset' must be a mapping with " From 26710eefc53837243956966cce3b4cc34b52c406 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 11:50:17 +0100 Subject: [PATCH 21/58] adding class for SystypeData --- validphys2/src/validphys/coredata.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index f4f6251d8a..a1ae3dd008 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -141,9 +141,30 @@ class CommonData: data: Pandas dataframe containing the commondata. """ + #TODO: Apply cuts setname: str ndata: int commondataproc: str nkin: int nsys: int data: pd.DataFrame + +@dataclasses.dataclass(eq=False) +class SystypeData: + """ + Data contained in Systype file. + Parameters: + ----------- + setname: str + Name of the dataset + nsys: int + Number of systematics + systypes: pd.DataFrame + Pandas dataframe containing the systype index + for each systematic alongside the uncertainty + treatment (ADD/MULT/RAND) and description + (CORR/UNCORR/THEORYCORR/SKIP) + """ + setname: str + nsys: int + systypes: pd.DataFrame From eda4a5d56f03a2b091e3bf6b01625999b5469fc3 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 11:51:55 +0100 Subject: [PATCH 22/58] adding systypeinfo object --- validphys2/src/validphys/commondataparser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index e807373601..0bb5cd8fdd 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -28,6 +28,12 @@ class CommonDataInfo: proc: str nsys: int +@dataclasses.dataclass(frozen=True) +class SystypeInfo: + """Class containing the basic properties of a systype file.""" + setname: str + nsys: int + def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. From b3ea44e1be9ba221dfef9b57953bc2274ae0c1b5 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 12:11:10 +0100 Subject: [PATCH 23/58] parse systype files as well --- validphys2/src/validphys/commondataparser.py | 58 +++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 0bb5cd8fdd..37fe78452e 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -15,10 +15,14 @@ import numpy as np import pandas as pd -from validphys.coredata import CommonData +from collections import namedtuple + +from validphys.coredata import CommonData, SystypeData class BadCommonDataError(Exception): """Exception raised when a commondata file cannot be parsed correctly""" +class BadSystypeError(Exception): + """Exception raised when a systype file cannot be parsed correctly""" @dataclasses.dataclass(frozen=True) class CommonDataInfo: @@ -34,26 +38,40 @@ class SystypeInfo: setname: str nsys: int +CommondataTables = namedtuple( + "CommondataTables", ("commondata_table", "systype_table") +) def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - - Returns a CommonData instance with data arranged like + + Returns an instance of the namedtuple CommondataTables, + with: + + commondata_table being a CommonData instance with data arranged like entry process kin1 kin2 kin3 data stat \ - sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N + sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N ; + + systype_table being a SystypeData instance with data arranged like + sys_index treatment description. """ commondatafile = spec.datafile # Getting set name from commondata file name setname = re.search('DATA_(.*).dat', str(commondatafile)).group(1) - tabledata = parse_commondata(commondatafile, setname) + commondata = parse_commondata(commondatafile, setname) - return tabledata + systypefile = spec.sysfile + systypedata = parse_systype(systypefile, setname) + + return CommondataTables( + commondata_table=commondata, systype_table=systypedata + ) def parse_commondata(f, setname): """Parse a commondata file into a CommonData. Raise a BadCommondDataError - if problems are encountered. + if problems are encountered. Parameters ---------- f : file @@ -85,3 +103,29 @@ def parse_commondata(f, setname): nkin = 3 , nsys = nsys, data = table) + + +def parse_systype(f, setname): + + """Parse a systype file into a SystypeData. Raise a BadSystypeDataError + if problems are encountered. + Parameters + ---------- + f : file + Open file-like object. + Returns + ------- + systypes : SystypeData + An object containing the data and information from the systype file. + """ + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + + # build header + header = ["sys_index", "treatment", "description"] + table.columns = header + + # Populate SystypeData object + return SystypeData( + setname = setname, + nsys = len(table), + systypes = table) \ No newline at end of file From 039b168ecdf61f7e32c5f56e74a81bcc241b1784 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:11:57 +0100 Subject: [PATCH 24/58] removing info objects --- validphys2/src/validphys/commondataparser.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 37fe78452e..4ecaefb86d 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -24,20 +24,6 @@ class BadCommonDataError(Exception): class BadSystypeError(Exception): """Exception raised when a systype file cannot be parsed correctly""" -@dataclasses.dataclass(frozen=True) -class CommonDataInfo: - """Class containing the basic properties of a commondata file.""" - setname: str - ndata: int - proc: str - nsys: int - -@dataclasses.dataclass(frozen=True) -class SystypeInfo: - """Class containing the basic properties of a systype file.""" - setname: str - nsys: int - CommondataTables = namedtuple( "CommondataTables", ("commondata_table", "systype_table") ) From 45749a0750aca10845d6d242828c14d8a2e08f39 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:17:12 +0100 Subject: [PATCH 25/58] adding error message --- validphys2/src/validphys/commondataparser.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 4ecaefb86d..1bc85d337f 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -67,8 +67,11 @@ def parse_commondata(f, setname): commondata : CommonData An object containing the data and information from the commondata file. """ - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - + try: + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + except Exception as e: + raise BadCommonDataError(f"Could not read file {f}. Please + check there is a valid COMMONDATA file at this location.") from e # remove NaNs # TODO: replace commondata files with bad formatting table.dropna(axis='columns', inplace=True) @@ -104,8 +107,11 @@ def parse_systype(f, setname): systypes : SystypeData An object containing the data and information from the systype file. """ - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - + try: + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + except Exception as e: + raise BadSystypeError(f"Could not read file {f}. Please check + there is a valid SYSTYPES file at this location.") from e # build header header = ["sys_index", "treatment", "description"] table.columns = header From 05b87a9ad68413e945076288f942e30540deb6d9 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:27:34 +0100 Subject: [PATCH 26/58] test for systype loading --- validphys2/src/validphys/commondataparser.py | 10 +++++----- .../src/validphys/tests/test_commondataparser.py | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 1bc85d337f..139fbcdd40 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -24,14 +24,14 @@ class BadCommonDataError(Exception): class BadSystypeError(Exception): """Exception raised when a systype file cannot be parsed correctly""" -CommondataTables = namedtuple( - "CommondataTables", ("commondata_table", "systype_table") +CommondataInfo = namedtuple( + "CommondataTables", ("commondata", "systypes") ) def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - Returns an instance of the namedtuple CommondataTables, + Returns an instance of the namedtuple CommondataInfo, with: commondata_table being a CommonData instance with data arranged like @@ -50,8 +50,8 @@ def load_commondata(spec): systypefile = spec.sysfile systypedata = parse_systype(systypefile, setname) - return CommondataTables( - commondata_table=commondata, systype_table=systypedata + return CommondataInfo( + commondata=commondata, systypes=systypedata ) def parse_commondata(f, setname): diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index 3c7adc187c..dfa337e2ff 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -7,5 +7,8 @@ def test_basic_commondata_loading(): l = Loader() cd = l.check_commondata(setname='H1HERAF2B') res = load_commondata(cd) - assert res.ndata == 12 - assert isinstance(res.data, pd.DataFrame) \ No newline at end of file + # Test commondata loading + assert res.commondata.ndata == 12 + assert isinstance(res.commondata.data, pd.DataFrame) + # Test systype loading + assert res.systypes.nsys == 25 From d869c33dfabfb2567900c9995c86aa205eaf96ca Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:30:01 +0100 Subject: [PATCH 27/58] fixing bug in string --- validphys2/src/validphys/commondataparser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 139fbcdd40..435818a2dd 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -70,8 +70,8 @@ def parse_commondata(f, setname): try: table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) except Exception as e: - raise BadCommonDataError(f"Could not read file {f}. Please - check there is a valid COMMONDATA file at this location.") from e + raise BadCommonDataError(f"Could not read file {f}. Please" + + "check there is a valid COMMONDATA file at this location.") from e # remove NaNs # TODO: replace commondata files with bad formatting table.dropna(axis='columns', inplace=True) @@ -110,8 +110,8 @@ def parse_systype(f, setname): try: table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) except Exception as e: - raise BadSystypeError(f"Could not read file {f}. Please check - there is a valid SYSTYPES file at this location.") from e + raise BadSystypeError(f"Could not read file {f}. Please check" + + "there is a valid SYSTYPES file at this location.") from e # build header header = ["sys_index", "treatment", "description"] table.columns = header From d6005fc81e806e45f8e668afdc7382a78bd1608f Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:30:45 +0100 Subject: [PATCH 28/58] renaming namedtuple --- validphys2/src/validphys/commondataparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 435818a2dd..d43f14fd89 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -25,7 +25,7 @@ class BadSystypeError(Exception): """Exception raised when a systype file cannot be parsed correctly""" CommondataInfo = namedtuple( - "CommondataTables", ("commondata", "systypes") + "CommondataInfo", ("commondata", "systypes") ) def load_commondata(spec): """ From 04a75b7e4d8c7349a3ecdc39703d84d469404624 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:39:10 +0100 Subject: [PATCH 29/58] choosing sys_index as the index for systype table --- validphys2/src/validphys/commondataparser.py | 6 ++++-- validphys2/src/validphys/coredata.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index d43f14fd89..bb245dd60d 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -91,7 +91,7 @@ def parse_commondata(f, setname): commondataproc = table["process"][1], nkin = 3 , nsys = nsys, - data = table) + commondata_table = table) def parse_systype(f, setname): @@ -112,12 +112,14 @@ def parse_systype(f, setname): except Exception as e: raise BadSystypeError(f"Could not read file {f}. Please check" + "there is a valid SYSTYPES file at this location.") from e + table.dropna(axis='columns', inplace=True) # build header header = ["sys_index", "treatment", "description"] table.columns = header + table.set_index("sys_index", inplace=True) # Populate SystypeData object return SystypeData( setname = setname, nsys = len(table), - systypes = table) \ No newline at end of file + systype_table = table) \ No newline at end of file diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index a1ae3dd008..8ab918afa9 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -138,7 +138,7 @@ class CommonData: Number of systematics sysid: list of str with length nsys ID for systematic - data: + commondata_table: Pandas dataframe containing the commondata. """ #TODO: Apply cuts @@ -147,7 +147,7 @@ class CommonData: commondataproc: str nkin: int nsys: int - data: pd.DataFrame + commondata_table: pd.DataFrame @dataclasses.dataclass(eq=False) class SystypeData: @@ -159,7 +159,7 @@ class SystypeData: Name of the dataset nsys: int Number of systematics - systypes: pd.DataFrame + systype_table: pd.DataFrame Pandas dataframe containing the systype index for each systematic alongside the uncertainty treatment (ADD/MULT/RAND) and description @@ -167,4 +167,4 @@ class SystypeData: """ setname: str nsys: int - systypes: pd.DataFrame + systype_table: pd.DataFrame From 1c22b8facdda90fb5a31fa47b0b0151c3aa1aa2e Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 25 May 2020 14:41:30 +0100 Subject: [PATCH 30/58] updating test --- validphys2/src/validphys/tests/test_commondataparser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index dfa337e2ff..8a825d0df1 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -9,6 +9,7 @@ def test_basic_commondata_loading(): res = load_commondata(cd) # Test commondata loading assert res.commondata.ndata == 12 - assert isinstance(res.commondata.data, pd.DataFrame) + assert isinstance(res.commondata.commondata_table, pd.DataFrame) # Test systype loading assert res.systypes.nsys == 25 + assert isinstance(res.systypes.systype_table, pd.DataFrame) From a72813e3bc52b5b691ef02b4a453b9bedbd0ce9a Mon Sep 17 00:00:00 2001 From: Cameron Voisey Date: Wed, 27 May 2020 12:00:35 +0100 Subject: [PATCH 31/58] Format with black --- validphys2/src/validphys/commondataparser.py | 71 ++++++++++--------- .../validphys/tests/test_commondataparser.py | 3 +- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index bb245dd60d..4054685a50 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -19,14 +19,18 @@ from validphys.coredata import CommonData, SystypeData + class BadCommonDataError(Exception): """Exception raised when a commondata file cannot be parsed correctly""" + + class BadSystypeError(Exception): """Exception raised when a systype file cannot be parsed correctly""" -CommondataInfo = namedtuple( - "CommondataInfo", ("commondata", "systypes") -) + +CommondataInfo = namedtuple("CommondataInfo", ("commondata", "systypes")) + + def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. @@ -44,18 +48,17 @@ def load_commondata(spec): commondatafile = spec.datafile # Getting set name from commondata file name - setname = re.search('DATA_(.*).dat', str(commondatafile)).group(1) + setname = re.search("DATA_(.*).dat", str(commondatafile)).group(1) commondata = parse_commondata(commondatafile, setname) systypefile = spec.sysfile systypedata = parse_systype(systypefile, setname) - - return CommondataInfo( - commondata=commondata, systypes=systypedata - ) + + return CommondataInfo(commondata=commondata, systypes=systypedata) + def parse_commondata(f, setname): - + """Parse a commondata file into a CommonData. Raise a BadCommondDataError if problems are encountered. Parameters @@ -68,34 +71,37 @@ def parse_commondata(f, setname): An object containing the data and information from the commondata file. """ try: - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + table = pd.read_csv(f, sep=r"\s+", skiprows=1, header=None) except Exception as e: - raise BadCommonDataError(f"Could not read file {f}. Please" - + "check there is a valid COMMONDATA file at this location.") from e + raise BadCommonDataError( + f"Could not read file {f}. Please" + + "check there is a valid COMMONDATA file at this location." + ) from e # remove NaNs # TODO: replace commondata files with bad formatting - table.dropna(axis='columns', inplace=True) + table.dropna(axis="columns", inplace=True) # build header - header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - nsys = (table.shape[1]-len(header))//2 + header = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"] + nsys = (table.shape[1] - len(header)) // 2 for i in range(nsys): - header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] + header += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] table.columns = header - table.set_index('entry', inplace=True) + table.set_index("entry", inplace=True) # Populate CommonData object return CommonData( - setname = setname, - ndata = len(table), - commondataproc = table["process"][1], - nkin = 3 , - nsys = nsys, - commondata_table = table) + setname=setname, + ndata=len(table), + commondataproc=table["process"][1], + nkin=3, + nsys=nsys, + commondata_table=table, + ) def parse_systype(f, setname): - + """Parse a systype file into a SystypeData. Raise a BadSystypeDataError if problems are encountered. Parameters @@ -107,19 +113,18 @@ def parse_systype(f, setname): systypes : SystypeData An object containing the data and information from the systype file. """ - try: - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + try: + table = pd.read_csv(f, sep=r"\s+", skiprows=1, header=None) except Exception as e: - raise BadSystypeError(f"Could not read file {f}. Please check" - + "there is a valid SYSTYPES file at this location.") from e - table.dropna(axis='columns', inplace=True) + raise BadSystypeError( + f"Could not read file {f}. Please check" + + "there is a valid SYSTYPES file at this location." + ) from e + table.dropna(axis="columns", inplace=True) # build header header = ["sys_index", "treatment", "description"] table.columns = header table.set_index("sys_index", inplace=True) # Populate SystypeData object - return SystypeData( - setname = setname, - nsys = len(table), - systype_table = table) \ No newline at end of file + return SystypeData(setname=setname, nsys=len(table), systype_table=table) diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index 8a825d0df1..b03c4f84d8 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -3,9 +3,10 @@ from validphys.commondataparser import load_commondata from validphys.loader import Loader + def test_basic_commondata_loading(): l = Loader() - cd = l.check_commondata(setname='H1HERAF2B') + cd = l.check_commondata(setname="H1HERAF2B") res = load_commondata(cd) # Test commondata loading assert res.commondata.ndata == 12 From 21b41ed999f8c058c2fe4973a731eaf495a07c69 Mon Sep 17 00:00:00 2001 From: Cameron Voisey Date: Wed, 27 May 2020 12:02:36 +0100 Subject: [PATCH 32/58] Remove unnecessary imports --- validphys2/src/validphys/commondataparser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 4054685a50..30f7a8b665 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,10 +6,6 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ -import io -import functools -import tarfile -import dataclasses import re import numpy as np From 8bcd766b647b8b9f3d92ecf02fd095a7497ad8d9 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Wed, 27 May 2020 12:07:33 +0100 Subject: [PATCH 33/58] amending review comments --- validphys2/src/validphys/commondataparser.py | 51 ++++++-------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index bb245dd60d..13ec1cde2f 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,38 +6,27 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ -import io -import functools -import tarfile -import dataclasses import re +from collections import namedtuple import numpy as np import pandas as pd -from collections import namedtuple - from validphys.coredata import CommonData, SystypeData -class BadCommonDataError(Exception): - """Exception raised when a commondata file cannot be parsed correctly""" -class BadSystypeError(Exception): - """Exception raised when a systype file cannot be parsed correctly""" - CommondataInfo = namedtuple( "CommondataInfo", ("commondata", "systypes") ) + + def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - Returns an instance of the namedtuple CommondataInfo, with: - commondata_table being a CommonData instance with data arranged like entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N ; - systype_table being a SystypeData instance with data arranged like sys_index treatment description. """ @@ -55,7 +44,6 @@ def load_commondata(spec): ) def parse_commondata(f, setname): - """Parse a commondata file into a CommonData. Raise a BadCommondDataError if problems are encountered. Parameters @@ -67,18 +55,14 @@ def parse_commondata(f, setname): commondata : CommonData An object containing the data and information from the commondata file. """ - try: - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - except Exception as e: - raise BadCommonDataError(f"Could not read file {f}. Please" - + "check there is a valid COMMONDATA file at this location.") from e + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) # remove NaNs # TODO: replace commondata files with bad formatting table.dropna(axis='columns', inplace=True) # build header header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - nsys = (table.shape[1]-len(header))//2 + nsys = (table.shape[1] - len(header)) // 2 for i in range(nsys): header += [f'sys.add.{i+1}', f'sys.mult.{i+1}'] table.columns = header @@ -86,16 +70,16 @@ def parse_commondata(f, setname): # Populate CommonData object return CommonData( - setname = setname, - ndata = len(table), - commondataproc = table["process"][1], - nkin = 3 , - nsys = nsys, - commondata_table = table) + setname=setname, + ndata=len(table), + commondataproc=table["process"][1], + nkin=3, + sys=nsys, + commondata_table=table + ) def parse_systype(f, setname): - """Parse a systype file into a SystypeData. Raise a BadSystypeDataError if problems are encountered. Parameters @@ -107,11 +91,7 @@ def parse_systype(f, setname): systypes : SystypeData An object containing the data and information from the systype file. """ - try: - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - except Exception as e: - raise BadSystypeError(f"Could not read file {f}. Please check" - + "there is a valid SYSTYPES file at this location.") from e + table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) table.dropna(axis='columns', inplace=True) # build header header = ["sys_index", "treatment", "description"] @@ -119,7 +99,4 @@ def parse_systype(f, setname): table.set_index("sys_index", inplace=True) # Populate SystypeData object - return SystypeData( - setname = setname, - nsys = len(table), - systype_table = table) \ No newline at end of file + return SystypeData(setname=setname, nsys=len(table), systype_table=table) \ No newline at end of file From 583e5d82c34c81d3bf1e472b3ae7af6dc89868e6 Mon Sep 17 00:00:00 2001 From: Cameron Voisey Date: Wed, 27 May 2020 12:42:10 +0100 Subject: [PATCH 34/58] Make comments and docstrings consistent --- validphys2/src/validphys/commondataparser.py | 16 +++++--- validphys2/src/validphys/coredata.py | 42 +++++++++++++------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 7ad3316b25..8878de2feb 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -45,21 +45,23 @@ def load_commondata(spec): def parse_commondata(f, setname): """Parse a commondata file into a CommonData. Raise a BadCommondDataError if problems are encountered. + Parameters ---------- f : file - Open file-like object. + Open file-like object. + Returns ------- commondata : CommonData An object containing the data and information from the commondata file. """ table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - # remove NaNs + # Remove NaNs # TODO: replace commondata files with bad formatting table.dropna(axis="columns", inplace=True) - # build header + # Build header header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] nsys = (table.shape[1] - len(header)) // 2 for i in range(nsys): @@ -80,11 +82,13 @@ def parse_commondata(f, setname): def parse_systype(f, setname): """Parse a systype file into a SystypeData. Raise a BadSystypeDataError - if problems are encountered. + if problems are encountered. + Parameters ---------- f : file - Open file-like object. + Open file-like object. + Returns ------- systypes : SystypeData @@ -92,7 +96,7 @@ def parse_systype(f, setname): """ table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) table.dropna(axis='columns', inplace=True) - # build header + # Build header header = ["sys_index", "treatment", "description"] table.columns = header table.set_index("sys_index", inplace=True) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 8ab918afa9..01d82dbe42 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -28,7 +28,7 @@ class FKTableData: xgrid : array, shape (nx) The points in x at which the PDFs should be evaluated. - sigma : DataFrame + sigma : pd.DataFrame For hadronic data, the columns are the indexes in the ``NfxNf`` list of possible flavour combinations of two PDFs. The MultiIndex contains three keys, the data index, an index into ``xgrid`` for the first PDF @@ -122,23 +122,32 @@ class CFactorData: class CommonData: """ Data contained in Commondata files, relevant cuts applied. + Parameters ---------- - setname: str + + setname : str Name of the dataset - ndata: int + + ndata : int Number of data points - commondataproc: str - Process type, one of 21 options. - nkin: int + + commondataproc : str + Process type, one of 21 options + + nkin : int Number of kinematics specified - kinematics: list of str with length nkin + + kinematics : list of str with length nkin Kinematic variables kin1, kin2, kin3 ... - nsys: int + + nsys : int Number of systematics - sysid: list of str with length nsys + + sysid : list of str with length nsys ID for systematic - commondata_table: + + commondata_table : pd.DataFrame Pandas dataframe containing the commondata. """ #TODO: Apply cuts @@ -153,13 +162,16 @@ class CommonData: class SystypeData: """ Data contained in Systype file. - Parameters: - ----------- - setname: str + + Parameters + ---------- + setname : str Name of the dataset - nsys: int + + nsys : int Number of systematics - systype_table: pd.DataFrame + + systype_table : pd.DataFrame Pandas dataframe containing the systype index for each systematic alongside the uncertainty treatment (ADD/MULT/RAND) and description From 4e1b6d3f4114a75c2a5606dfa74e807eb8eb10e2 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 27 May 2020 13:22:50 +0100 Subject: [PATCH 35/58] Removing explanation of raises in docstrings --- validphys2/src/validphys/commondataparser.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 8878de2feb..f99b5b9754 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -43,8 +43,7 @@ def load_commondata(spec): def parse_commondata(f, setname): - """Parse a commondata file into a CommonData. Raise a BadCommondDataError - if problems are encountered. + """Parse a commondata file into a CommonData. Parameters ---------- @@ -81,8 +80,7 @@ def parse_commondata(f, setname): def parse_systype(f, setname): - """Parse a systype file into a SystypeData. Raise a BadSystypeDataError - if problems are encountered. + """Parse a systype file into a SystypeData. Parameters ---------- From 7e7758b2309f7a59019550b2f0ebe138812fb200 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 27 May 2020 13:42:15 +0100 Subject: [PATCH 36/58] Removing blank lines --- validphys2/src/validphys/coredata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 01d82dbe42..313cd0f66d 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -152,9 +152,9 @@ class CommonData: """ #TODO: Apply cuts setname: str - ndata: int + ndata: int commondataproc: str - nkin: int + nkin: int nsys: int commondata_table: pd.DataFrame From 028452677353beb8602e15bb15576b68874442c4 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 27 May 2020 13:49:12 +0100 Subject: [PATCH 37/58] Adding some more properties --- validphys2/src/validphys/coredata.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 313cd0f66d..38772cc4ef 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -158,6 +158,21 @@ class CommonData: nsys: int commondata_table: pd.DataFrame + @property + def central_values(self): + return self.commondata_table["data"] + + @property + def stat_err(self): + return self.commondata_table["stat"] + + @property + def sys_err(self): + return self.commondata_table.drop( + columns=["process", "kin1", "kin2", "kin3", "data", "stat"] + ) + + @dataclasses.dataclass(eq=False) class SystypeData: """ From c50394ac2a4b702d3399d402033c0b2eed1938fb Mon Sep 17 00:00:00 2001 From: Cameron Voisey Date: Wed, 27 May 2020 13:46:28 +0100 Subject: [PATCH 38/58] Fix name of argument given to CommonData --- validphys2/src/validphys/commondataparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index f99b5b9754..1f86229cb2 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -74,7 +74,7 @@ def parse_commondata(f, setname): ndata=len(table), commondataproc=table["process"][1], nkin=3, - sys=nsys, + nsys=nsys, commondata_table=table ) From c3a40f3fdb93f23743d63406b845624a5808f6a8 Mon Sep 17 00:00:00 2001 From: siranipour Date: Wed, 27 May 2020 14:19:49 +0100 Subject: [PATCH 39/58] Using pathlib to handle setname parsing --- validphys2/src/validphys/commondataparser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 1f86229cb2..1291c79a88 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,7 +6,6 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ -import re from collections import namedtuple import numpy as np @@ -33,7 +32,7 @@ def load_commondata(spec): commondatafile = spec.datafile # Getting set name from commondata file name - setname = re.search("DATA_(.*).dat", str(commondatafile)).group(1) + setname = commondatafile.name[:-4] # removing the .dat suffix commondata = parse_commondata(commondatafile, setname) systypefile = spec.sysfile From d666c2be79cb4a46fe79b6bfc08c38115f0d5186 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Wed, 27 May 2020 14:32:43 +0100 Subject: [PATCH 40/58] getting process type from 1st entry --- validphys2/src/validphys/commondataparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 1291c79a88..740cfb3331 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -71,7 +71,7 @@ def parse_commondata(f, setname): return CommonData( setname=setname, ndata=len(table), - commondataproc=table["process"][1], + commondataproc=table["process"][0], nkin=3, nsys=nsys, commondata_table=table From 21a9dfe59e449a0509d166fc6a955c29f44a47ba Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Thu, 28 May 2020 09:53:12 +0100 Subject: [PATCH 41/58] merging CommonData and SystypeData objects --- validphys2/src/validphys/commondataparser.py | 86 ++++++++------------ validphys2/src/validphys/coredata.py | 32 ++------ 2 files changed, 41 insertions(+), 77 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 740cfb3331..1888647421 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,97 +6,77 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ -from collections import namedtuple - import numpy as np import pandas as pd -from validphys.coredata import CommonData, SystypeData - -CommondataInfo = namedtuple( - "CommondataInfo", ("commondata", "systypes") -) - +from validphys.coredata import CommonData def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - Returns an instance of the namedtuple CommondataInfo, + Returns an instance of CommonData, with: - commondata_table being a CommonData instance with data arranged like + commondata_table being a pandas dataframe with data arranged like entry process kin1 kin2 kin3 data stat \ sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N ; - systype_table being a SystypeData instance with data arranged like + systype_table being a pandas dataframe with data arranged like sys_index treatment description. """ commondatafile = spec.datafile - # Getting set name from commondata file name setname = commondatafile.name[:-4] # removing the .dat suffix - commondata = parse_commondata(commondatafile, setname) - systypefile = spec.sysfile - systypedata = parse_systype(systypefile, setname) - return CommondataInfo(commondata=commondata, systypes=systypedata) + commondata = parse_commondata(commondatafile, systypefile, setname) + + return commondata -def parse_commondata(f, setname): - """Parse a commondata file into a CommonData. +def parse_commondata(commondatafile, systypefile, setname): + """Parse a commondata file and a systype file into a CommonData. Parameters ---------- - f : file + commondatafile : file + Open file-like object. + systypefile : file Open file-like object. Returns ------- commondata : CommonData - An object containing the data and information from the commondata file. + An object containing the data and information from the commondata + and systype files. """ - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) + # First parse commondata file + commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None) # Remove NaNs # TODO: replace commondata files with bad formatting - table.dropna(axis="columns", inplace=True) - + commondatatable.dropna(axis="columns", inplace=True) # Build header - header = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] - nsys = (table.shape[1] - len(header)) // 2 + commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] + nsys = (commondatatable.shape[1] - len(commondataheader)) // 2 for i in range(nsys): - header += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] - table.columns = header - table.set_index("entry", inplace=True) + commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] + commondataable.columns = commondataheader + commmondatatable.set_index("entry", inplace=True) + + # Now parse systyle file + systypetable = pd.read_csv(systypefile, sep=r'\s+', skiprows=1, header=None) + systypetable.dropna(axis='columns', inplace=True) + # Build header + systypeheader = ["sys_index", "type", "name"] + systypetable.columns = systypeheader + systypetable.set_index("sys_index", inplace=True) # Populate CommonData object return CommonData( setname=setname, ndata=len(table), - commondataproc=table["process"][0], + commondataproc=table["process"][1], nkin=3, nsys=nsys, - commondata_table=table + commondata_table=commondatatable + systype_table=systypetable ) - -def parse_systype(f, setname): - """Parse a systype file into a SystypeData. - - Parameters - ---------- - f : file - Open file-like object. - - Returns - ------- - systypes : SystypeData - An object containing the data and information from the systype file. - """ - table = pd.read_csv(f, sep=r'\s+', skiprows=1, header=None) - table.dropna(axis='columns', inplace=True) - # Build header - header = ["sys_index", "treatment", "description"] - table.columns = header - table.set_index("sys_index", inplace=True) - - # Populate SystypeData object - return SystypeData(setname=setname, nsys=len(table), systype_table=table) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 38772cc4ef..8073c795e1 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -148,7 +148,13 @@ class CommonData: ID for systematic commondata_table : pd.DataFrame - Pandas dataframe containing the commondata. + Pandas dataframe containing the commondata + + systype_table : pd.DataFrame + Pandas dataframe containing the systype index + for each systematic alongside the uncertainty + type (ADD/MULT/RAND) and name + (CORR/UNCORR/THEORYCORR/SKIP) """ #TODO: Apply cuts setname: str @@ -157,6 +163,7 @@ class CommonData: nkin: int nsys: int commondata_table: pd.DataFrame + systype_table: pd.DataFrame @property def central_values(self): @@ -172,26 +179,3 @@ def sys_err(self): columns=["process", "kin1", "kin2", "kin3", "data", "stat"] ) - -@dataclasses.dataclass(eq=False) -class SystypeData: - """ - Data contained in Systype file. - - Parameters - ---------- - setname : str - Name of the dataset - - nsys : int - Number of systematics - - systype_table : pd.DataFrame - Pandas dataframe containing the systype index - for each systematic alongside the uncertainty - treatment (ADD/MULT/RAND) and description - (CORR/UNCORR/THEORYCORR/SKIP) - """ - setname: str - nsys: int - systype_table: pd.DataFrame From fa84a064a7e988229d4d9eb036210d8789bb1e5c Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 28 May 2020 12:01:54 +0100 Subject: [PATCH 42/58] Correcting typos --- validphys2/src/validphys/commondataparser.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 1888647421..0195a7ac0d 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -45,7 +45,7 @@ def parse_commondata(commondatafile, systypefile, setname): Returns ------- commondata : CommonData - An object containing the data and information from the commondata + An object containing the data and information from the commondata and systype files. """ # First parse commondata file @@ -58,8 +58,8 @@ def parse_commondata(commondatafile, systypefile, setname): nsys = (commondatatable.shape[1] - len(commondataheader)) // 2 for i in range(nsys): commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] - commondataable.columns = commondataheader - commmondatatable.set_index("entry", inplace=True) + commondatatable.columns = commondataheader + commondatatable.set_index("entry", inplace=True) # Now parse systyle file systypetable = pd.read_csv(systypefile, sep=r'\s+', skiprows=1, header=None) @@ -72,11 +72,11 @@ def parse_commondata(commondatafile, systypefile, setname): # Populate CommonData object return CommonData( setname=setname, - ndata=len(table), - commondataproc=table["process"][1], + ndata=len(commondatatable), + commondataproc=commondatatable["process"][1], nkin=3, nsys=nsys, - commondata_table=commondatatable + commondata_table=commondatatable, systype_table=systypetable ) From f0e8a03d55c7848cf36746660ef651f050b3f73d Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 28 May 2020 12:29:20 +0100 Subject: [PATCH 43/58] Fixing tests --- validphys2/src/validphys/tests/test_commondataparser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index b03c4f84d8..76528ca302 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -9,8 +9,8 @@ def test_basic_commondata_loading(): cd = l.check_commondata(setname="H1HERAF2B") res = load_commondata(cd) # Test commondata loading - assert res.commondata.ndata == 12 - assert isinstance(res.commondata.commondata_table, pd.DataFrame) + assert res.ndata == 12 + assert isinstance(res.commondata_table, pd.DataFrame) # Test systype loading - assert res.systypes.nsys == 25 - assert isinstance(res.systypes.systype_table, pd.DataFrame) + assert res.nsys == 25 + assert isinstance(res.systype_table, pd.DataFrame) From c2312b33cc165fb08276d23bdc3f34f880e6389f Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 28 May 2020 13:03:21 +0100 Subject: [PATCH 44/58] Adding systype table to CommonData class --- validphys2/src/validphys/coredata.py | 33 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 8073c795e1..48f3264e83 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -118,6 +118,15 @@ class CFactorData: central_value: np.array uncertainty: np.array + +@dataclasses.dataclass(eq=False) +class SystematicError: + add: float + mult: float + sys_type: str #e.g ADD + name: str #e.g UNCORR + + @dataclasses.dataclass(eq=False) class CommonData: """ @@ -170,12 +179,28 @@ def central_values(self): return self.commondata_table["data"] @property - def stat_err(self): + def stat_errors(self): return self.commondata_table["stat"] @property - def sys_err(self): - return self.commondata_table.drop( + def sys_errors(self): + sys_table = self.commondata_table.drop( columns=["process", "kin1", "kin2", "kin3", "data", "stat"] ) - + table = [ + [ + SystematicError( + add=sys_table[f"sys.add.{j+1}"][i + 1], + mult=sys_table[f"sys.mult.{j+1}"][i + 1], + sys_type=self.systype_table["type"][j + 1], + name=self.systype_table["name"][j + 1], + ) + for j in range(self.nsys) + ] + for i in range(self.ndata) + ] + return pd.DataFrame( + table, + columns=[f"sys.{i+1}" for i in range(self.nsys)], + index=range(1, self.ndata + 1), + ) From 1195554ebcefeba4104b252d2293808efdc8a36b Mon Sep 17 00:00:00 2001 From: siranipour Date: Thu, 28 May 2020 13:49:18 +0100 Subject: [PATCH 45/58] Adding __repr__ and __str__ methods for pretty printing the systematic errors table --- validphys2/src/validphys/coredata.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 48f3264e83..fa9a3347c0 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -126,6 +126,17 @@ class SystematicError: sys_type: str #e.g ADD name: str #e.g UNCORR + def __repr__(self): + return (f"{self.__class__.__name__}(add={self.add}, mult={self.mult}," + "sys_type={self.sys_type}, name={self.name})") + + def __str__(self): + pretty_print = (f"add: {'%.2g' % self.add} " + f"mult: {'%.2g' % self.mult} " + f"type: {self.sys_type} " + f"name: {self.name}") + return pretty_print + @dataclasses.dataclass(eq=False) class CommonData: From d922ae0c85b93e8b8a2c21c1de90d86c8eae0277 Mon Sep 17 00:00:00 2001 From: siranipour Date: Sat, 6 Jun 2020 13:42:45 +0100 Subject: [PATCH 46/58] Adding a __repr__ method --- validphys2/src/validphys/coredata.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index fa9a3347c0..a23931c435 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -130,13 +130,6 @@ def __repr__(self): return (f"{self.__class__.__name__}(add={self.add}, mult={self.mult}," "sys_type={self.sys_type}, name={self.name})") - def __str__(self): - pretty_print = (f"add: {'%.2g' % self.add} " - f"mult: {'%.2g' % self.mult} " - f"type: {self.sys_type} " - f"name: {self.name}") - return pretty_print - @dataclasses.dataclass(eq=False) class CommonData: From 0e2bf010d5803cdce65dc2693947511165640735 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Fri, 29 May 2020 10:31:52 +0100 Subject: [PATCH 47/58] changing comments --- validphys2/src/validphys/commondataparser.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 0195a7ac0d..82bd82c30b 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -14,13 +14,7 @@ def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. - Returns an instance of CommonData, - with: - commondata_table being a pandas dataframe with data arranged like - entry process kin1 kin2 kin3 data stat \ - sys.add.0 sys.mult.0 .... sys.add.N sys.mult.N ; - systype_table being a pandas dataframe with data arranged like - sys_index treatment description. + Returns an instance of CommonData """ commondatafile = spec.datafile # Getting set name from commondata file name @@ -37,10 +31,8 @@ def parse_commondata(commondatafile, systypefile, setname): Parameters ---------- - commondatafile : file - Open file-like object. - systypefile : file - Open file-like object. + commondatafile : file or path to file + systypefile : file or path to file Returns ------- From a92b92e6da9aa08352f31759469028cdafe559e5 Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 29 May 2020 10:44:20 +0100 Subject: [PATCH 48/58] Removing blank lines --- validphys2/src/validphys/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index eb1f548540..7971a27091 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -231,8 +231,8 @@ def grid_values_index(self): return range(0, len(self)) else: raise RuntimeError("Unknown error type") - - + + kinlabels_latex = CommonData.kinLabel_latex.asdict() _kinlabels_keys = sorted(kinlabels_latex, key=len, reverse=True) From c11746e842a85772fb80332b1ca66339597fb019 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Fri, 29 May 2020 12:07:14 +0100 Subject: [PATCH 49/58] loading empty systematics as empty dataframe --- validphys2/src/validphys/commondataparser.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 82bd82c30b..8f9687bad6 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -53,13 +53,16 @@ def parse_commondata(commondatafile, systypefile, setname): commondatatable.columns = commondataheader commondatatable.set_index("entry", inplace=True) - # Now parse systyle file - systypetable = pd.read_csv(systypefile, sep=r'\s+', skiprows=1, header=None) - systypetable.dropna(axis='columns', inplace=True) - # Build header - systypeheader = ["sys_index", "type", "name"] - systypetable.columns = systypeheader - systypetable.set_index("sys_index", inplace=True) + # Now parse systype file + try: + systypetable = pd.read_csv(systypefile, sep=r'\s+', skiprows=1, header=None) + systypetable.dropna(axis='columns', inplace=True) + # Build header + systypeheader = ["sys_index", "type", "name"] + systypetable.columns = systypeheader + systypetable.set_index("sys_index", inplace=True) + except pd.io.common.EmptyDataError: + systypetable = pd.DataFrame() # Populate CommonData object return CommonData( From 54956380fdc0351c955f94b23a86fc00b9bb56c6 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Fri, 29 May 2020 12:10:27 +0100 Subject: [PATCH 50/58] comment explaining try/except for sys load --- validphys2/src/validphys/commondataparser.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 8f9687bad6..83319f87ab 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -54,15 +54,17 @@ def parse_commondata(commondatafile, systypefile, setname): commondatatable.set_index("entry", inplace=True) # Now parse systype file + systypeheader = ["sys_index", "type", "name"] try: - systypetable = pd.read_csv(systypefile, sep=r'\s+', skiprows=1, header=None) + systypetable = pd.read_csv( + systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None + ) systypetable.dropna(axis='columns', inplace=True) - # Build header - systypeheader = ["sys_index", "type", "name"] - systypetable.columns = systypeheader - systypetable.set_index("sys_index", inplace=True) - except pd.io.common.EmptyDataError: - systypetable = pd.DataFrame() + # Some datasets e.g. CMSWCHARMRAT have no systematics + except pd.errors.EmptyDataError: + systypetable = pd.DataFrame(columns=systypeheader) + + systypetable.set_index("sys_index", inplace=True) # Populate CommonData object return CommonData( From a3bd404c0351d0cba1bb3099b417aa6c8d6e1996 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 1 Jun 2020 11:20:05 +0100 Subject: [PATCH 51/58] remove dropna line --- validphys2/src/validphys/commondataparser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 83319f87ab..4ad7c976f5 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -44,7 +44,6 @@ def parse_commondata(commondatafile, systypefile, setname): commondatatable = pd.read_csv(commondatafile, sep=r'\s+', skiprows=1, header=None) # Remove NaNs # TODO: replace commondata files with bad formatting - commondatatable.dropna(axis="columns", inplace=True) # Build header commondataheader = ['entry', 'process', 'kin1', 'kin2', 'kin3', 'data', 'stat'] nsys = (commondatatable.shape[1] - len(commondataheader)) // 2 From 78e20101a0ba6141f405f0cba99b64ee3102d49f Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 1 Jun 2020 11:56:23 +0100 Subject: [PATCH 52/58] test for ds with no systematics --- validphys2/src/validphys/tests/test_commondataparser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index 76528ca302..954eb1ea69 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -14,3 +14,8 @@ def test_basic_commondata_loading(): # Test systype loading assert res.nsys == 25 assert isinstance(res.systype_table, pd.DataFrame) + # Test a dataset with no systematics + emptysyscd = l.check_commondata(setname="CMSWCHARMRAT") + emptysysres = load_commondata(emptysyscd) + assert emptysysres.nsys == 0 + assert emptysysres.systype_table.empty is True \ No newline at end of file From 84ed62ea937dd71b98738053ce4b77bc1b264b53 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 1 Jun 2020 14:40:25 +0100 Subject: [PATCH 53/58] separating parsing of systype files --- validphys2/src/validphys/commondataparser.py | 30 ++++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 4ad7c976f5..4425d2fbc9 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -51,8 +51,24 @@ def parse_commondata(commondatafile, systypefile, setname): commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] commondatatable.columns = commondataheader commondatatable.set_index("entry", inplace=True) + + # Now parse the systype file + systypetable = parse_systypes(systypefile, setname) - # Now parse systype file + # Populate CommonData object + return CommonData( + setname=setname, + ndata=len(commondatatable), + commondataproc=commondatatable["process"][1], + nkin=3, + nsys=nsys, + commondata_table=commondatatable, + systype_table=systypetable + ) + +def parse_systypes(systypefile, setname): + """Parses a systype file and returns a pandas dataframe. + """ systypeheader = ["sys_index", "type", "name"] try: systypetable = pd.read_csv( @@ -65,14 +81,4 @@ def parse_commondata(commondatafile, systypefile, setname): systypetable.set_index("sys_index", inplace=True) - # Populate CommonData object - return CommonData( - setname=setname, - ndata=len(commondatatable), - commondataproc=commondatatable["process"][1], - nkin=3, - nsys=nsys, - commondata_table=commondatatable, - systype_table=systypetable - ) - + return systypetable \ No newline at end of file From c89c33e918f5b0e68345fd6075d521b62e809296 Mon Sep 17 00:00:00 2001 From: RosalynLP Date: Mon, 1 Jun 2020 15:07:12 +0100 Subject: [PATCH 54/58] checking table info against metadata from peek_commondata_metadata --- validphys2/src/validphys/commondataparser.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 4425d2fbc9..2630e4ba69 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd +from validphys.core import peek_commondata_metadata from validphys.coredata import CommonData def load_commondata(spec): @@ -18,7 +19,7 @@ def load_commondata(spec): """ commondatafile = spec.datafile # Getting set name from commondata file name - setname = commondatafile.name[:-4] # removing the .dat suffix + setname = commondatafile.name[5:-4] # DATA prefix and .dat suffix systypefile = spec.sysfile commondata = parse_commondata(commondatafile, systypefile, setname) @@ -51,15 +52,23 @@ def parse_commondata(commondatafile, systypefile, setname): commondataheader += [f"sys.add.{i+1}", f"sys.mult.{i+1}"] commondatatable.columns = commondataheader commondatatable.set_index("entry", inplace=True) - + ndata = len(commondatatable) + commondataproc = commondatatable["process"][1] + # Check for consistency with commondata metadata + cdmetadata = peek_commondata_metadata(commondatafile) + assert setname == cdmetadata.name and \ + nsys == cdmetadata.nsys and ndata == cdmetadata.ndata \ + and commondataproc == cdmetadata.process_type, \ + "Commondata table information does not match metadata" + # Now parse the systype file systypetable = parse_systypes(systypefile, setname) # Populate CommonData object return CommonData( setname=setname, - ndata=len(commondatatable), - commondataproc=commondatatable["process"][1], + ndata=ndata, + commondataproc=commondataproc, nkin=3, nsys=nsys, commondata_table=commondatatable, From c5a39ec42114fa1fdf2465c2906ed02509658009 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 1 Jun 2020 17:01:14 +0100 Subject: [PATCH 55/58] Adding a with_cuts method --- validphys2/src/validphys/commondataparser.py | 7 ++-- validphys2/src/validphys/coredata.py | 39 +++++++++++++++---- .../validphys/tests/test_commondataparser.py | 3 +- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 2630e4ba69..069e97c891 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,7 +6,6 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ -import numpy as np import pandas as pd from validphys.core import peek_commondata_metadata @@ -62,7 +61,7 @@ def parse_commondata(commondatafile, systypefile, setname): "Commondata table information does not match metadata" # Now parse the systype file - systypetable = parse_systypes(systypefile, setname) + systypetable = parse_systypes(systypefile) # Populate CommonData object return CommonData( @@ -75,7 +74,7 @@ def parse_commondata(commondatafile, systypefile, setname): systype_table=systypetable ) -def parse_systypes(systypefile, setname): +def parse_systypes(systypefile): """Parses a systype file and returns a pandas dataframe. """ systypeheader = ["sys_index", "type", "name"] @@ -90,4 +89,4 @@ def parse_systypes(systypefile, setname): systypetable.set_index("sys_index", inplace=True) - return systypetable \ No newline at end of file + return systypetable diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index a23931c435..d916bf55c9 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -178,6 +178,29 @@ class CommonData: commondata_table: pd.DataFrame systype_table: pd.DataFrame + def with_cuts(self, cuts): + """A method to return a CommonData object where + an integer mask has been applied, keeping only data + points which pass cuts. + + Note if the first data point passes cuts, the first entry + of ``cuts`` should be ``1`` not ``0``. + + Paramters + --------- + cuts: list or validphys.core.Cuts or None + """ + + if hasattr(cuts, 'load'): + cuts = cuts.load() + if cuts is None: + return self + newndata = len(cuts) + new_commondata_table = self.commondata_table.loc[cuts] + return dataclasses.replace( + self, ndata=newndata, commondata_table=new_commondata_table + ) + @property def central_values(self): return self.commondata_table["data"] @@ -194,17 +217,17 @@ def sys_errors(self): table = [ [ SystematicError( - add=sys_table[f"sys.add.{j+1}"][i + 1], - mult=sys_table[f"sys.mult.{j+1}"][i + 1], - sys_type=self.systype_table["type"][j + 1], - name=self.systype_table["name"][j + 1], + add=sys_table[f"sys.add.{j}"][i], + mult=sys_table[f"sys.mult.{j}"][i], + sys_type=self.systype_table["type"][j], + name=self.systype_table["name"][j], ) - for j in range(self.nsys) + for j in self.systype_table.index ] - for i in range(self.ndata) + for i in self.commondata_table.index ] return pd.DataFrame( table, - columns=[f"sys.{i+1}" for i in range(self.nsys)], - index=range(1, self.ndata + 1), + columns=[f"sys.{i}" for i in self.systype_table.index], + index=self.commondata_table.index, ) diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index 954eb1ea69..f020370f2c 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -1,5 +1,6 @@ import pandas as pd +from validphys.api import API from validphys.commondataparser import load_commondata from validphys.loader import Loader @@ -18,4 +19,4 @@ def test_basic_commondata_loading(): emptysyscd = l.check_commondata(setname="CMSWCHARMRAT") emptysysres = load_commondata(emptysyscd) assert emptysysres.nsys == 0 - assert emptysysres.systype_table.empty is True \ No newline at end of file + assert emptysysres.systype_table.empty is True From e79bd2df6a60eb68c273bf8721e49fdff2e387ad Mon Sep 17 00:00:00 2001 From: siranipour Date: Fri, 5 Jun 2020 14:18:58 +0100 Subject: [PATCH 56/58] Adding tests for with_cuts method --- validphys2/src/validphys/coredata.py | 7 +++- .../validphys/tests/test_commondataparser.py | 37 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index d916bf55c9..8d30e6c4aa 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -169,7 +169,6 @@ class CommonData: type (ADD/MULT/RAND) and name (CORR/UNCORR/THEORYCORR/SKIP) """ - #TODO: Apply cuts setname: str ndata: int commondataproc: str @@ -190,11 +189,17 @@ def with_cuts(self, cuts): --------- cuts: list or validphys.core.Cuts or None """ + # Ensure that the cuts we're applying applies to this dataset + # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts` + if hasattr(cuts, 'name') and self.setname != cuts.name: + raise ValueError(f"The cuts provided are for {cuts.name} which does not apply " + f"to this commondata file: {self.setname}") if hasattr(cuts, 'load'): cuts = cuts.load() if cuts is None: return self + newndata = len(cuts) new_commondata_table = self.commondata_table.loc[cuts] return dataclasses.replace( diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index f020370f2c..83607c5b89 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -1,8 +1,9 @@ +import pytest import pandas as pd from validphys.api import API from validphys.commondataparser import load_commondata -from validphys.loader import Loader +from validphys.loader import FallbackLoader as Loader def test_basic_commondata_loading(): @@ -20,3 +21,37 @@ def test_basic_commondata_loading(): emptysysres = load_commondata(emptysyscd) assert emptysysres.nsys == 0 assert emptysysres.systype_table.empty is True + + +def test_commondata_with_cuts(): + l = Loader() + setname = "NMC" + + cd = l.check_commondata(setname=setname) + loaded_cd = load_commondata(cd) + + fit_cuts = l.check_fit_cuts(fit="191015-mw-001", setname=setname) + internal_cuts = l.check_internal_cuts( + cd, API.rules(theoryid=162, use_cuts="internal") + ) + + loaded_cd_fit_cuts = loaded_cd.with_cuts(fit_cuts) + assert all(loaded_cd_fit_cuts.commondata_table.index == fit_cuts.load()) + assert all(loaded_cd_fit_cuts.sys_errors.index == fit_cuts.load()) + + loaded_cd_internal_cuts = loaded_cd.with_cuts(internal_cuts) + assert all(loaded_cd_internal_cuts.commondata_table.index == internal_cuts.load()) + + loaded_cd_nocuts = loaded_cd.with_cuts(None) + assert all(loaded_cd_nocuts.commondata_table.index == range(1, cd.ndata + 1)) + + preloaded_fit_cuts = fit_cuts.load() + loaded_cd_preloaded_cuts = loaded_cd.with_cuts(fit_cuts) + assert all(loaded_cd_preloaded_cuts.commondata_table.index == preloaded_fit_cuts) + + assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index == [1, 2, 3]) + + # Check that giving cuts for another dataset raises the correct ValueError exception + bad_cuts = l.check_fit_cuts(fit="191015-mw-001", setname="NMCPD") + with pytest.raises(ValueError): + loaded_cd.with_cuts(bad_cuts) From ad1ab6bd0f3e3300854acb3baa0692711e5f337c Mon Sep 17 00:00:00 2001 From: siranipour Date: Sat, 6 Jun 2020 13:33:33 +0100 Subject: [PATCH 57/58] Correcting consistency check --- validphys2/src/validphys/commondataparser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 069e97c891..c1f95bb07b 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -6,6 +6,8 @@ the codebase is currently work in progress, and at the moment this module serves as a proof of concept. """ +from operator import attrgetter + import pandas as pd from validphys.core import peek_commondata_metadata @@ -55,10 +57,8 @@ def parse_commondata(commondatafile, systypefile, setname): commondataproc = commondatatable["process"][1] # Check for consistency with commondata metadata cdmetadata = peek_commondata_metadata(commondatafile) - assert setname == cdmetadata.name and \ - nsys == cdmetadata.nsys and ndata == cdmetadata.ndata \ - and commondataproc == cdmetadata.process_type, \ - "Commondata table information does not match metadata" + if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata): + raise ValueError("Commondata table information does not match metadata") # Now parse the systype file systypetable = parse_systypes(systypefile) From 11c4b5dd49af6528918d1ea1029f0d85bf59cdc3 Mon Sep 17 00:00:00 2001 From: siranipour Date: Mon, 8 Jun 2020 14:28:53 +0100 Subject: [PATCH 58/58] Incrementing cuts by 1 We must do this as cuts indexing starts at 0 while commondata indexing starts at 1. --- validphys2/src/validphys/coredata.py | 6 +++++- .../src/validphys/tests/test_commondataparser.py | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 8d30e6c4aa..8d1ad57dfa 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -183,7 +183,7 @@ def with_cuts(self, cuts): points which pass cuts. Note if the first data point passes cuts, the first entry - of ``cuts`` should be ``1`` not ``0``. + of ``cuts`` should be ``0``. Paramters --------- @@ -200,6 +200,10 @@ def with_cuts(self, cuts): if cuts is None: return self + # We must shift the cuts up by 1 since a cut of 0 implies the first data point + # while commondata indexing starts at 1. + cuts = list(map(lambda x: x + 1, cuts)) + newndata = len(cuts) new_commondata_table = self.commondata_table.loc[cuts] return dataclasses.replace( diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index 83607c5b89..3645545b18 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -36,20 +36,22 @@ def test_commondata_with_cuts(): ) loaded_cd_fit_cuts = loaded_cd.with_cuts(fit_cuts) - assert all(loaded_cd_fit_cuts.commondata_table.index == fit_cuts.load()) - assert all(loaded_cd_fit_cuts.sys_errors.index == fit_cuts.load()) + # We must do these - 1 subtractions due to the fact that cuts indexing + # starts at 0 while commondata indexing starts at 1 + assert all(loaded_cd_fit_cuts.commondata_table.index - 1 == fit_cuts.load()) + assert all(loaded_cd_fit_cuts.sys_errors.index - 1 == fit_cuts.load()) loaded_cd_internal_cuts = loaded_cd.with_cuts(internal_cuts) - assert all(loaded_cd_internal_cuts.commondata_table.index == internal_cuts.load()) + assert all(loaded_cd_internal_cuts.commondata_table.index - 1 == internal_cuts.load()) loaded_cd_nocuts = loaded_cd.with_cuts(None) assert all(loaded_cd_nocuts.commondata_table.index == range(1, cd.ndata + 1)) preloaded_fit_cuts = fit_cuts.load() loaded_cd_preloaded_cuts = loaded_cd.with_cuts(fit_cuts) - assert all(loaded_cd_preloaded_cuts.commondata_table.index == preloaded_fit_cuts) + assert all(loaded_cd_preloaded_cuts.commondata_table.index - 1 == preloaded_fit_cuts) - assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index == [1, 2, 3]) + assert all(loaded_cd.with_cuts([1, 2, 3]).commondata_table.index - 1 == [1, 2, 3]) # Check that giving cuts for another dataset raises the correct ValueError exception bad_cuts = l.check_fit_cuts(fit="191015-mw-001", setname="NMCPD")