From 9f7e5a15a0ec03f7f2f5eef681240a49e6e3d7ab Mon Sep 17 00:00:00 2001 From: Zahari Date: Tue, 19 Mar 2019 22:29:35 +0000 Subject: [PATCH 01/13] Add pure Python FK parser Add functionality to read fktables in pure python. This supports transparently both compressed and plain tables. We have functionality to process both types of sections defined in the spec, namely blobs and option lists. We support types casting (and more general parsing) for the option values and define various specialized parsing for various blobs (at the moment based on numpy). These still need some work. At the moment the code relies on the untested assumption that the FastKernel section comes last. For the moment, return a dictionary with all the fields. --- validphys2/src/validphys/fkparser.py | 151 +++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 validphys2/src/validphys/fkparser.py diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py new file mode 100644 index 0000000000..16d4ec8657 --- /dev/null +++ b/validphys2/src/validphys/fkparser.py @@ -0,0 +1,151 @@ +""" +fkparser.py + +Parse FKtables into useful datastructures +""" +import io +import functools +import tarfile + +import numpy as np + + +class BadFKTableError(Exception): + pass + +def load_fktable(spec): + with open_fkpath(spec.fkpath) as handle: + return parse_fktable(handle) + + +def _get_compressed_buffer(path): + archive = tarfile.open(path) + members = archive.getmembers() + l = len(members) + if l != 1: + raise BadFKTableError( + f"Archive {path} should contain one file, but it contains {l}.") + return archive.extractfile(members[0]) + + +def open_fkpath(path): + """Return a file-like object from the fktable path, regardless of whether + it is compressed""" + if tarfile.is_tarfile(path): + return _get_compressed_buffer(path) + return open(path, 'rb') + + +def _is_header_line(line): + return line.startswith((b'_', b'{')) + +def _parse_fk_options(line_and_stream, 
value_parsers=None): + """Parse a sequence of lines of the form + *OPTION: VALUE + into a dictionary. + """ + res = {} + if value_parsers is None: + value_parsers = {} + for lineno, next_line in line_and_stream: + if _is_header_line(next_line): + return res, lineno, next_line + if not next_line.startswith(b'*'): + raise BadFKTableError(f"Error on line {lineno}: Expecting an option starting with '*'") + try: + keybytes, valuebytes = next_line.split(b':', maxsplit=1) + except ValueError: + raise BadFKTableError(f"Error on line {lineno}: Expecting an option containing ':'") + key = keybytes[1:].strip().decode() + if key in value_parsers: + try: + value = value_parsers[key](valuebytes) + except Exception as e: + raise BadFKTableError(f"Could not parse key {key} on line {lineno}") from e + else: + value = valuebytes.strip().decode() + res[key] = value + + raise BadFKTableError("FKTable should end with FastKernel spec, not with a set of options") + + +def _segment_parser(f): + @functools.wraps(f) + def f_(line_and_stream): + buf = io.BytesIO() + for lineno, next_line in line_and_stream: + if _is_header_line(next_line): + processed = f(buf) + return processed, lineno, next_line + buf.write(next_line) + raise BadFKTableError("FKTable should end with FastKernel spec, not with a segment string") + return f_ + +@_segment_parser +def _parse_string(buf): + return buf.getvalue().decode() + +@_segment_parser +def _parse_flavour_map(buf): + buf.seek(0) + return np.loadtxt(buf, dtype=bool) + +@_segment_parser +def _parse_xgrid(buf): + return np.fromstring(buf.getvalue(), sep='\n') + +# This used a differen interface from segment parser because we want it to +# be fast. +# We assume it is going to be the last section. 
+def _parse_fast_kernel(f): + return np.loadtxt(f) + + +def _parse_header(lineno, header): + if not _is_header_line(header): + raise BadFKTableError(f"Bad header at line {lineno}: First " + "character should be either '_' or '{'") + try: + endname = header.index(b'_', 1) + except ValueError: + raise BadFKTableError(f"Bad header at line {lineno}: Expected '_' after name") from None + header_name = header[1:endname] + #Note: This is not the same as header[0]. Bytes iterate as ints. + return header[0:1], header_name.decode() + + +_KNOWN_SEGMENTS = { + "GridDesc": _parse_string, + "VersionInfo": _parse_fk_options, + "GridInfo": functools.partial( + _parse_fk_options, value_parsers={"HADRONIC": bool, "NDATA": int, "NX": int} + ), + "FlavourMap": _parse_flavour_map, + "xGrid": _parse_xgrid, +} + +def parse_fktable(f): + line_and_stream = enumerate(f, start=1) + res = {} + lineno, header = next(line_and_stream) + while True: + marker, header_name = _parse_header(lineno, header) + if header_name in _KNOWN_SEGMENTS: + parser = _KNOWN_SEGMENTS[header_name] + elif marker == b'{': + parser = _parse_string + elif marker == b'_': + parser = _parse_fk_options + else: + raise RuntimeError("Should not be here") + if header_name == 'FastKernel': + res['FastKernel'] = _parse_fast_kernel(f) + return res + try: + out, lineno, header = parser(line_and_stream) + except Exception as e: + #Note that the old lineno is the one we want + raise BadFKTableError(f"Failed processing header {header_name} on line {lineno}") from e + res[header_name] = out + if not header: + break From 3d535614e086027549c44585c3cb96eb594d401e Mon Sep 17 00:00:00 2001 From: Zahari Date: Wed, 20 Mar 2019 11:30:43 +0000 Subject: [PATCH 02/13] Use pandas for parsing This is the easiest way to correctly process the indexes. Moreover we need something more complicated than a one character separator because there are fktables that use '\t' and ' ' at the same time. 
For the moment this only works for hadronic tables, as DIS have different indexing. --- validphys2/src/validphys/fkparser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 16d4ec8657..188f92c4f3 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -8,6 +8,7 @@ import tarfile import numpy as np +import pandas as pd class BadFKTableError(Exception): @@ -98,7 +99,10 @@ def _parse_xgrid(buf): # be fast. # We assume it is going to be the last section. def _parse_fast_kernel(f): - return np.loadtxt(f) + # Note that we need the slower whitespace here because it turns out + # that there are fktables where space and tab are used as separators + # within the same table. + return pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1,2)) def _parse_header(lineno, header): From dd14faa5bf861dd26c7e7caa8e6d11ab3cb9303f Mon Sep 17 00:00:00 2001 From: Zahari Date: Thu, 21 Mar 2019 10:44:15 +0000 Subject: [PATCH 03/13] Add support for DIS tables --- validphys2/src/validphys/fkparser.py | 65 +++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 188f92c4f3..7fb23fa1f2 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -6,15 +6,23 @@ import io import functools import tarfile +import dataclasses import numpy as np import pandas as pd +@dataclasses.dataclass(frozen=True) +class GridInfo: + setname: str + hadronic: bool + ndata: int + nx: int class BadFKTableError(Exception): - pass + """Exception raised when an FKTable cannot be parsed correctly""" def load_fktable(spec): + """Load the data corresponding to a FKSpec object""" with open_fkpath(spec.fkpath) as handle: return parse_fktable(handle) @@ -98,11 +106,37 @@ def _parse_xgrid(buf): # This used a differen interface from 
segment parser because we want it to # be fast. # We assume it is going to be the last section. -def _parse_fast_kernel(f): +def _parse_hadronic_fast_kernel(f): + """Parse the FastKernel secrion of an hadronic FKTable into a DataFrame. + ``f`` should be a stream containing only the section""" # Note that we need the slower whitespace here because it turns out # that there are fktables where space and tab are used as separators # within the same table. - return pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1,2)) + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1,2)) + df.columns = list(range(14*14)) + df.index.names = ['data', 'x1', 'x2'] + return df + +def _parse_dis_fast_kernel(f): + """Parse the FastKernel section of a DIS FKTable into a DataFrame. + ``f`` should be a stream containing only the section""" + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1)) + df.columns = list(range(14)) + df.index.names = ['data', 'x'] + return df + + +def _parse_gridinfo(line_and_stream): + d, l, h = _parse_fk_options( + line_and_stream, + value_parsers={ + "HADRONIC": lambda x: bool(int(x)), + "NDATA": int, + "NX": int + }) + gi = GridInfo(**{k.lower(): v for k, v in d.items()}) + return gi, l, h + def _parse_header(lineno, header): @@ -121,9 +155,7 @@ def _parse_header(lineno, header): _KNOWN_SEGMENTS = { "GridDesc": _parse_string, "VersionInfo": _parse_fk_options, - "GridInfo": functools.partial( - _parse_fk_options, value_parsers={"HADRONIC": bool, "NDATA": int, "NX": int} - ), + "GridInfo": _parse_gridinfo, "FlavourMap": _parse_flavour_map, "xGrid": _parse_xgrid, } @@ -134,7 +166,21 @@ def parse_fktable(f): lineno, header = next(line_and_stream) while True: marker, header_name = _parse_header(lineno, header) - if header_name in _KNOWN_SEGMENTS: + if header_name == "FastKernel": + try: + gi = res["GridInfo"] + except KeyError: + raise BadFKTableError( + "'GridInfo' section must come before 'FastKernel' section " + f"at {lineno}" + ) from None + 
res["FastKernel"] = ( + _parse_hadronic_fast_kernel(f) + if gi.hadronic + else _parse_dis_fast_kernel(f) + ) + return res + elif header_name in _KNOWN_SEGMENTS: parser = _KNOWN_SEGMENTS[header_name] elif marker == b'{': parser = _parse_string @@ -142,14 +188,9 @@ parser = _parse_fk_options else: raise RuntimeError("Should not be here") - if header_name == 'FastKernel': - res['FastKernel'] = _parse_fast_kernel(f) - return res try: out, lineno, header = parser(line_and_stream) except Exception as e: #Note that the old lineno is the one we want raise BadFKTableError(f"Failed processing header {header_name} on line {lineno}") from e res[header_name] = out - if not header: - break From a2e7521ef20328b4f6b549aae6d8715a8af14022 Mon Sep 17 00:00:00 2001 From: Zahari Date: Thu, 21 Mar 2019 19:15:38 +0000 Subject: [PATCH 04/13] Improve parsing Return a higher level object, with the required information to do a convolution, and a bunch of metadata. Parse the TheoryInfo (although we might not use much of it in practice) and refactor a bit the error checking so as to not clutter the main code path. 
--- validphys2/src/validphys/fkparser.py | 90 ++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 13 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 7fb23fa1f2..3b7d832977 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd + @dataclasses.dataclass(frozen=True) class GridInfo: setname: str @@ -18,6 +19,14 @@ class GridInfo: ndata: int nx: int +@dataclasses.dataclass(eq=False) +class FKTableData: + hadronic: bool + Q0: float + xgrid: np.array + sigma: pd.DataFrame + metadata: dict = dataclasses.field(default_factory=dict, repr=False) + class BadFKTableError(Exception): """Exception raised when an FKTable cannot be parsed correctly""" @@ -48,6 +57,9 @@ def open_fkpath(path): def _is_header_line(line): return line.startswith((b'_', b'{')) +def _bytes_to_bool(x): + return bool(int(x)) + def _parse_fk_options(line_and_stream, value_parsers=None): """Parse a sequence of lines of the form *OPTION: VALUE @@ -130,7 +142,7 @@ def _parse_gridinfo(line_and_stream): d, l, h = _parse_fk_options( line_and_stream, value_parsers={ - "HADRONIC": lambda x: bool(int(x)), + "HADRONIC": _bytes_to_bool, "NDATA": int, "NX": int }) @@ -152,14 +164,71 @@ def _parse_header(lineno, header): return header[0:1], header_name.decode() +def _build_sigma(lineno, f, res): + gi = res["GridInfo"] + fm = res["FlavourMap"] + table = ( + _parse_hadronic_fast_kernel(f) if gi.hadronic else _parse_dis_fast_kernel(f) + ) + # Filter out empty flavour indices + table = table.loc[:, fm.ravel()] + return table + _KNOWN_SEGMENTS = { "GridDesc": _parse_string, "VersionInfo": _parse_fk_options, "GridInfo": _parse_gridinfo, "FlavourMap": _parse_flavour_map, "xGrid": _parse_xgrid, + "TheoryInfo": functools.partial( + _parse_fk_options, + value_parsers={ + "ID": int, + "PTO": int, + "DAMP": _bytes_to_bool, + "IC": _bytes_to_bool, + "XIR": float, + 
"XIF": float, + "NfFF": int, + "MaxNfAs": int, + "MaxNfPdf": int, + "Q0": float, + "alphas": float, + "Qref": float, + "QED": _bytes_to_bool, + "alphaqed": float, + "Qedref": float, + "SxRes": _bytes_to_bool, + "mc": float, + "Qmc": float, + "kcThr": float, + "mb": float, + "Qmb": float, + "kbThr": float, + "mt": float, + "Qmt": float, + "ktThr": float, + "MZ": float, + "MW": float, + "GF": float, + "SIN2TW": float, + "TMC": _bytes_to_bool, + "MP": float, + "global_nx": int, + "EScaleVar": _bytes_to_bool, + }, + ), } +def _check_required_sections(res, lineno): + """Check that we have found all the required sections by the time we + reach 'FastKernel'""" + for section in _KNOWN_SEGMENTS: + if section not in res: + raise BadFKTableError( + f"{section} must come before 'FastKernel' section at {lineno}" + ) + def parse_fktable(f): line_and_stream = enumerate(f, start=1) res = {} @@ -167,19 +236,14 @@ def parse_fktable(f): while True: marker, header_name = _parse_header(lineno, header) if header_name == "FastKernel": - try: - gi = res["GridInfo"] - except KeyError: - raise BadFKTableError( - "'GridInfo' section must come before 'FastKernel' section " - f"at {lineno}" - ) from None - res["FastKernel"] = ( - _parse_hadronic_fast_kernel(f) - if gi.hadronic - else _parse_dis_fast_kernel(f) + _check_required_sections(res, lineno) + Q0 = res['TheoryInfo']['Q0'] + sigma = _build_sigma(lineno, f, res) + hadronic = res['GridInfo'].hadronic + xgrid = res.pop('xGrid') + return FKTableData( + sigma=sigma, Q0=Q0, metadata=res, hadronic=hadronic, xgrid=xgrid ) - return res elif header_name in _KNOWN_SEGMENTS: parser = _KNOWN_SEGMENTS[header_name] elif marker == b'{': From 9dbc03978fc6fed8c52616216ee22d7ce59bbdfa Mon Sep 17 00:00:00 2001 From: Zahari Date: Thu, 21 Mar 2019 19:25:50 +0000 Subject: [PATCH 05/13] Add dataclasses dependency for python 3.6 --- conda-recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 
7f1f1932e3..7df8c3d589 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -44,6 +44,7 @@ requirements: - sphinx # documentation - recommonmark - sphinx_rtd_theme + - dataclasses # [py==36] test: requires: From fb1a8bbf4e90f6799470b2e259aa5434b9250aa1 Mon Sep 17 00:00:00 2001 From: Zahari Date: Tue, 26 Mar 2019 17:19:54 +0000 Subject: [PATCH 06/13] Add support for cfactors Add some lower level functionality to parse the files, which could be useful if we e.g. did something with the Monte Carlo uncertainty. At high level, all the cfactors simply multiply the fktable. --- validphys2/src/validphys/fkparser.py | 71 +++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 3b7d832977..7837441b2e 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -1,7 +1,7 @@ """ fkparser.py -Parse FKtables into useful datastructures +Parse FKtables and CFactors into useful datastructures. """ import io import functools @@ -23,18 +23,45 @@ class GridInfo: class FKTableData: hadronic: bool Q0: float + ndata: int xgrid: np.array sigma: pd.DataFrame metadata: dict = dataclasses.field(default_factory=dict, repr=False) +@dataclasses.dataclass(eq=False) +class CFactorData: + description: str + central_value: np.array + uncertainty: np.array + +class BadCFactorError(Exception): + """Exception raised when an CFactor cannot be parsed correctly""" + + class BadFKTableError(Exception): """Exception raised when an FKTable cannot be parsed correctly""" def load_fktable(spec): - """Load the data corresponding to a FKSpec object""" + """Load the data corresponding to a FKSpec object. 
The cfactors + will be applied to the grid.""" with open_fkpath(spec.fkpath) as handle: - return parse_fktable(handle) - + tabledata = parse_fktable(handle) + if not spec.cfactors: + return tabledata + + ndata = tabledata.ndata + cfprod = np.ones(ndata) + for cf in spec.cfactors: + with open(cf, "rb") as f: + cfdata = parse_cfactor(f) + if len(cfdata.central_value) != ndata: + raise BadCFactorError( + "Length of cfactor data does not match the length of the fktable." + ) + cfprod *= cfdata.central_value + # TODO: Find a way to do this in place + tabledata.sigma = tabledata.sigma.multiply(pd.Series(cfprod), axis=0, level=0) + return tabledata def _get_compressed_buffer(path): archive = tarfile.open(path) @@ -240,9 +267,15 @@ def parse_fktable(f): Q0 = res['TheoryInfo']['Q0'] sigma = _build_sigma(lineno, f, res) hadronic = res['GridInfo'].hadronic + ndata = res['GridInfo'].ndata xgrid = res.pop('xGrid') return FKTableData( - sigma=sigma, Q0=Q0, metadata=res, hadronic=hadronic, xgrid=xgrid + sigma=sigma, + ndata=ndata, + Q0=Q0, + metadata=res, + hadronic=hadronic, + xgrid=xgrid, ) elif header_name in _KNOWN_SEGMENTS: parser = _KNOWN_SEGMENTS[header_name] @@ -255,6 +288,30 @@ def parse_fktable(f): try: out, lineno, header = parser(line_and_stream) except Exception as e: - #Note that the old lineno is the one we want - raise BadFKTableError(f"Failed processing header {header_name} on line {lineno}") from e + # Note that the old lineno is the one we want + raise BadFKTableError( + f"Failed processing header {header_name} on line {lineno}" + ) from e res[header_name] = out + + +def parse_cfactor(f): + """Parse an open byte stream into a ``CFactorData""" + stars = f.readline() + if not stars.startswith(b'*'): + raise BadCFactorError("First line should start with '*'.") + descbytes = io.BytesIO() + for line in f: + if line.startswith(b'*'): + break + descbytes.write(line) + description = descbytes.getvalue().decode() + try: + data = np.loadtxt(f) + except Exception as e: + 
raise BadCFactorError(e) from e + central_value = data[:, 0] + uncertainty = data[:, 1] + return CFactorData( + description=description, central_value=central_value, uncertainty=uncertainty + ) From f90ef0d8b6cfd70eb120203a91028c1a0936e118 Mon Sep 17 00:00:00 2001 From: Zahari Date: Tue, 26 Mar 2019 17:58:34 +0000 Subject: [PATCH 07/13] Add some documentation --- validphys2/src/validphys/fkparser.py | 69 +++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 7837441b2e..14b20ff325 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -1,7 +1,19 @@ """ fkparser.py -Parse FKtables and CFactors into useful datastructures. +Parse FKtables and CFactors into useful datastructures. This module include +some functionality to process FKTables and cfactors. Most users will be +interested in using the high level interface ``loaf_fktable``. Given an +``FKTableSpec``, it returns an instance of ``FHTableData``, an object with the +required information to compute a convolution, with the CFactors applied. + +.. code-block:: python + + from validphys.fkparser import load_fktable + from validphys.loader import Loader + l = Loader() + fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',)) + res = load_fktable(fk) """ import io import functools @@ -14,6 +26,7 @@ @dataclasses.dataclass(frozen=True) class GridInfo: + """Class containig the basic properties of an FKTable grid.""" setname: str hadronic: bool ndata: int @@ -21,6 +34,39 @@ class GridInfo: @dataclasses.dataclass(eq=False) class FKTableData: + """ + Data contained in an FKTable + + Parameters + ---------- + + hadronic : bool + Whether an hadronic (two PDFs) or a DIS (one PDF) convolution is needed. + + Q0 : float + The scale at which the PDFs should be evaluated (in GeV). + + ndata : int + The number of data points in the grid. 
+ + xgrid : array, shape (nx) + The points in x at which the PDFs should be evaluated. + + sigma : DataFrame + For hadronic data, the columns are the indexes in the ``NfxNf`` list of + possible flavour combinations of two PDFs. The MultiIndex contains + three keys, the data index, an index into ``xgrid`` for the first PDF + and an idex into ``xgrid`` for the second PDF, indicatinf the points in + ``x`` where the PDF should be evaluated. + + For DIS data, the columns are indexes in the ``Nf`` list of flavours. + The MultiIndex contains two keys, the data index and an index into + ``xgrid`` indicating the points in ``x`` where the PDF should be + evaluated. + + metadata : dict + Other information contained in the FKTable. + """ hadronic: bool Q0: float ndata: int @@ -30,6 +76,22 @@ class FKTableData: @dataclasses.dataclass(eq=False) class CFactorData: + """ + Data contained in a CFactor + + Parameters + ---------- + + description : str + Information on how the data was obtained. + + central_value : array, shape(ndata) + The value of the cfactor for each data point. + + uncertainty : array, shape(ndata) + The absolute uncerainty on the cfactor if available. Otherwise a list + of zeros. + """ description: str central_value: np.array uncertainty: np.array @@ -257,6 +319,8 @@ def _check_required_sections(res, lineno): ) def parse_fktable(f): + """Parse an open byte stream into an FKTableData. Raise a BaadFKTableError + if problems are encountered.""" line_and_stream = enumerate(f, start=1) res = {} lineno, header = next(line_and_stream) @@ -296,7 +360,8 @@ def parse_fktable(f): def parse_cfactor(f): - """Parse an open byte stream into a ``CFactorData""" + """Parse an open byte stream into a ``CFactorData``. 
Raise a + BadCFactorError if problems are encontered.""" stars = f.readline() if not stars.startswith(b'*'): raise BadCFactorError("First line should start with '*'.") From a77422356fae28775d6fab52bd7092574696e7ae Mon Sep 17 00:00:00 2001 From: juacrumar Date: Tue, 31 Mar 2020 16:33:16 +0200 Subject: [PATCH 08/13] fix typos --- validphys2/src/validphys/fkparser.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 14b20ff325..15d412d1cd 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -1,7 +1,7 @@ """ fkparser.py -Parse FKtables and CFactors into useful datastructures. This module include +Parse FKtables and CFactors into useful datastructures. This module includes some functionality to process FKTables and cfactors. Most users will be interested in using the high level interface ``loaf_fktable``. Given an ``FKTableSpec``, it returns an instance of ``FHTableData``, an object with the @@ -41,7 +41,7 @@ class FKTableData: ---------- hadronic : bool - Whether an hadronic (two PDFs) or a DIS (one PDF) convolution is needed. + Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed. Q0 : float The scale at which the PDFs should be evaluated (in GeV). @@ -56,7 +56,7 @@ class FKTableData: For hadronic data, the columns are the indexes in the ``NfxNf`` list of possible flavour combinations of two PDFs. The MultiIndex contains three keys, the data index, an index into ``xgrid`` for the first PDF - and an idex into ``xgrid`` for the second PDF, indicatinf the points in + and an idex into ``xgrid`` for the second PDF, indicating if the points in ``x`` where the PDF should be evaluated. For DIS data, the columns are indexes in the ``Nf`` list of flavours. @@ -89,7 +89,7 @@ class CFactorData: The value of the cfactor for each data point. 
uncertainty : array, shape(ndata) - The absolute uncerainty on the cfactor if available. Otherwise a list + The absolute uncertainty on the cfactor if available. Otherwise a list of zeros. """ description: str @@ -204,7 +204,7 @@ def _parse_flavour_map(buf): def _parse_xgrid(buf): return np.fromstring(buf.getvalue(), sep='\n') -# This used a differen interface from segment parser because we want it to +# This used a different interface from segment parser because we want it to # be fast. # We assume it is going to be the last section. def _parse_hadronic_fast_kernel(f): @@ -361,7 +361,7 @@ def parse_fktable(f): def parse_cfactor(f): """Parse an open byte stream into a ``CFactorData``. Raise a - BadCFactorError if problems are encontered.""" + BadCFactorError if problems are encountered.""" stars = f.readline() if not stars.startswith(b'*'): raise BadCFactorError("First line should start with '*'.") From 387bfe8866524e9eb2e5842952727f75179d0f7c Mon Sep 17 00:00:00 2001 From: juacrumar Date: Tue, 31 Mar 2020 16:39:16 +0200 Subject: [PATCH 09/13] missed two typos --- validphys2/src/validphys/fkparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 15d412d1cd..dca4543c32 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -3,7 +3,7 @@ Parse FKtables and CFactors into useful datastructures. This module includes some functionality to process FKTables and cfactors. Most users will be -interested in using the high level interface ``loaf_fktable``. Given an +interested in using the high level interface ``load_fktable``. Given an ``FKTableSpec``, it returns an instance of ``FHTableData``, an object with the required information to compute a convolution, with the CFactors applied. 
@@ -26,7 +26,7 @@ @dataclasses.dataclass(frozen=True) class GridInfo: - """Class containig the basic properties of an FKTable grid.""" + """Class containing the basic properties of an FKTable grid.""" setname: str hadronic: bool ndata: int From bd375876163db581e1e199a4eb93ae30d77d57bb Mon Sep 17 00:00:00 2001 From: Zahari Date: Thu, 2 Apr 2020 17:11:17 +0100 Subject: [PATCH 10/13] Split data structures in a separate module These could potentially be obtained in various other ways, so it makes sense to separate them and make it clear that they are not tied to the file format. --- validphys2/src/validphys/coredata.py | 72 +++++++++++++++++++++ validphys2/src/validphys/fkparser.py | 97 ++++++---------------------- 2 files changed, 93 insertions(+), 76 deletions(-) create mode 100644 validphys2/src/validphys/coredata.py diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py new file mode 100644 index 0000000000..508a843bea --- /dev/null +++ b/validphys2/src/validphys/coredata.py @@ -0,0 +1,72 @@ +""" +Data containers backed by Python managed memory (Numpy arrays and Pandas +dataframes). This module is intended to substitute large parts of the C++ +wrappers. + +""" +import dataclasses +import numpy as np +import pandas as pd + +@dataclasses.dataclass(eq=False) +class FKTableData: + """ + Data contained in an FKTable + + Parameters + ---------- + + hadronic : bool + Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed. + + Q0 : float + The scale at which the PDFs should be evaluated (in GeV). + + ndata : int + The number of data points in the grid. + + xgrid : array, shape (nx) + The points in x at which the PDFs should be evaluated. + + sigma : DataFrame + For hadronic data, the columns are the indexes in the ``NfxNf`` list of + possible flavour combinations of two PDFs. 
The MultiIndex contains + three keys, the data index, an index into ``xgrid`` for the first PDF + and an idex into ``xgrid`` for the second PDF, indicating if the points in + ``x`` where the PDF should be evaluated. + + For DIS data, the columns are indexes in the ``Nf`` list of flavours. + The MultiIndex contains two keys, the data index and an index into + ``xgrid`` indicating the points in ``x`` where the PDF should be + evaluated. + + metadata : dict + Other information contained in the FKTable. + """ + hadronic: bool + Q0: float + ndata: int + xgrid: np.array + sigma: pd.DataFrame + metadata: dict = dataclasses.field(default_factory=dict, repr=False) + +@dataclasses.dataclass(eq=False) +class CFactorData: + """ + Data contained in a CFactor + + Parameters + ---------- + + description : str + Information on how the data was obtained. + + central_value : array, shape(ndata) + The value of the cfactor for each data point. + + uncertainty : array, shape(ndata) + The absolute uncertainty on the cfactor if available. + """ + description: str + central_value: np.array + uncertainty: np.array diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index dca4543c32..af295c9fc5 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -1,11 +1,15 @@ """ -fkparser.py - -Parse FKtables and CFactors into useful datastructures. This module includes -some functionality to process FKTables and cfactors. Most users will be -interested in using the high level interface ``load_fktable``. Given an -``FKTableSpec``, it returns an instance of ``FHTableData``, an object with the -required information to compute a convolution, with the CFactors applied. 
+This module implements parsers for FKtable and CFactor files into useful +datastructures, contained in the :py:mod:`validphys.coredata` module, which are +not backed by C++ managed memory, and so they can be easily pickled and +interfaces with common Python libraries. The integration of these objects into +the codebase is currently work in progrress, and at the moment this module +serves as a proof of concept. + +Most users will be interested in using the high level interface +``load_fktable``. Given an :py:class:`coredata.FKTableSpec`, it returns an +instance of ``FKTableData``, an object with the required information to compute +a convolution, with the CFactors applied. .. code-block:: python @@ -23,78 +27,10 @@ import numpy as np import pandas as pd +from validphys.coredata import FKTableData, CFactorData -@dataclasses.dataclass(frozen=True) -class GridInfo: - """Class containing the basic properties of an FKTable grid.""" - setname: str - hadronic: bool - ndata: int - nx: int - -@dataclasses.dataclass(eq=False) -class FKTableData: - """ - Data contained in an FKTable - - Parameters - ---------- - - hadronic : bool - Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed. - - Q0 : float - The scale at which the PDFs should be evaluated (in GeV). - - ndata : int - The number of data points in the grid. - xgrid : array, shape (nx) - The points in x at which the PDFs should be evaluated. - sigma : DataFrame - For hadronic data, the columns are the indexes in the ``NfxNf`` list of - possible flavour combinations of two PDFs. The MultiIndex contains - three keys, the data index, an index into ``xgrid`` for the first PDF - and an idex into ``xgrid`` for the second PDF, indicating if the points in - ``x`` where the PDF should be evaluated. - - For DIS data, the columns are indexes in the ``Nf`` list of flavours. 
- The MultiIndex contains two keys, the data index and an index into - ``xgrid`` indicating the points in ``x`` where the PDF should be - evaluated. - - metadata : dict - Other information contained in the FKTable. - """ - hadronic: bool - Q0: float - ndata: int - xgrid: np.array - sigma: pd.DataFrame - metadata: dict = dataclasses.field(default_factory=dict, repr=False) - -@dataclasses.dataclass(eq=False) -class CFactorData: - """ - Data contained in a CFactor - - Parameters - ---------- - - description : str - Information on how the data was obtained. - - central_value : array, shape(ndata) - The value of the cfactor for each data point. - - uncertainty : array, shape(ndata) - The absolute uncertainty on the cfactor if available. Otherwise a list - of zeros. - """ - description: str - central_value: np.array - uncertainty: np.array class BadCFactorError(Exception): """Exception raised when an CFactor cannot be parsed correctly""" @@ -103,6 +39,15 @@ class BadCFactorError(Exception): class BadFKTableError(Exception): """Exception raised when an FKTable cannot be parsed correctly""" + +@dataclasses.dataclass(frozen=True) +class GridInfo: + """Class containing the basic properties of an FKTable grid.""" + setname: str + hadronic: bool + ndata: int + nx: int + def load_fktable(spec): """Load the data corresponding to a FKSpec object. The cfactors will be applied to the grid.""" From bf8e5eeae2e02f6ff49787531a81c1668ab2f263 Mon Sep 17 00:00:00 2001 From: Zahari Date: Thu, 2 Apr 2020 19:23:45 +0100 Subject: [PATCH 11/13] Add docs and minimal internal code improvements Make minor cosmetic changes and extend and improve docs. 
--- doc/sphinx/source/theory/FastInterface.rst | 2 + doc/sphinx/source/vp/index.rst | 1 + doc/sphinx/source/vp/pydataobjs.rst | 29 ++++++++++ validphys2/src/validphys/coredata.py | 1 - validphys2/src/validphys/fkparser.py | 65 ++++++++++++++++++---- 5 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 doc/sphinx/source/vp/pydataobjs.rst diff --git a/doc/sphinx/source/theory/FastInterface.rst b/doc/sphinx/source/theory/FastInterface.rst index eaec53fb29..6a9d650afb 100644 --- a/doc/sphinx/source/theory/FastInterface.rst +++ b/doc/sphinx/source/theory/FastInterface.rst @@ -1,3 +1,5 @@ +.. _fktables: + ============================================================ Fast Interface (FK tables) ============================================================ diff --git a/doc/sphinx/source/vp/index.rst b/doc/sphinx/source/vp/index.rst index c31753ba1a..236e6f4f92 100644 --- a/doc/sphinx/source/vp/index.rst +++ b/doc/sphinx/source/vp/index.rst @@ -14,3 +14,4 @@ vp-guide ./nnprofile.md ./scripts.md ./theorycov/index + ./pydataobjs.rst diff --git a/doc/sphinx/source/vp/pydataobjs.rst b/doc/sphinx/source/vp/pydataobjs.rst new file mode 100644 index 0000000000..27e4b9973b --- /dev/null +++ b/doc/sphinx/source/vp/pydataobjs.rst @@ -0,0 +1,29 @@ +.. _pyobjs: + +Python based data objects +========================= + +Internal data formats such as PDF sets, CommonData, or :ref:`FKTables +` files are currently accessed though the `libnnpdf` C++ code +(interfaced trough the SWIG wrappers). However there is a :ref:`project +` underway +to make these resources available in terms of containers standard Python +(particularly numpy arrays and pandas dataframes). The objectives include +simplifying the codebase, increasing the ease of use and enabling more advanced +computation and storage strategies. + +Loading FKTables +---------------- + +Currently only FKTables can be directly without C++ code. This is implemented +in the :py:mod:`validphys.fkarser` module. 
For example::
+
+    from validphys.fkparser import load_fktable
+    from validphys.loader import Loader
+    l = Loader()
+    fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',))
+    res = load_fktable(fk)
+
+results in an object containing all the information needed to compute a
+convolution. In particular the ``sigma`` property contains a dataframe
+representing the partonic cross-section (including the cfactors).
diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py
index 508a843bea..e307c7e6ea 100644
--- a/validphys2/src/validphys/coredata.py
+++ b/validphys2/src/validphys/coredata.py
@@ -15,7 +15,6 @@ class FKTableData:
 
     Parameters
     ----------
-
     hadronic : bool
         Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed.
 
diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py
index af295c9fc5..82b22220cb 100644
--- a/validphys2/src/validphys/fkparser.py
+++ b/validphys2/src/validphys/fkparser.py
@@ -7,9 +7,10 @@ serves as a proof of concept.
 
 Most users will be interested in using the high level interface
-``load_fktable``. Given an :py:class:`coredata.FKTableSpec`, it returns an
-instance of ``FKTableData``, an object with the required information to compute
-a convolution, with the CFactors applied.
+:py:func:`load_fktable`. Given a :py:class:`validphys.core.FKTableSpec`
+object, it returns an instance of :py:class:`validphys.coredata.FKTableData`,
+an object with the required information to compute a convolution, with the
+CFactors applied.
 
 .. code-block:: python
 
@@ -82,7 +83,19 @@ def _get_compressed_buffer(path):
 
 def open_fkpath(path):
     """Return a file-like object from the fktable path, regardless of whether
-    it is compressed"""
+    it is compressed
+
+    Parameters
+    ----------
+    path: Path or str
+        Path like file containing a valid FKTable. It can be either inside a
+        tarball or in plain text.
+
+    Returns
+    -------
+    f: file
+        A file like object for further processing.
+ """ if tarfile.is_tarfile(path): return _get_compressed_buffer(path) return open(path, 'rb') @@ -173,15 +186,15 @@ def _parse_dis_fast_kernel(f): def _parse_gridinfo(line_and_stream): - d, l, h = _parse_fk_options( + dict_result, line_number, next_line = _parse_fk_options( line_and_stream, value_parsers={ "HADRONIC": _bytes_to_bool, "NDATA": int, "NX": int }) - gi = GridInfo(**{k.lower(): v for k, v in d.items()}) - return gi, l, h + gi = GridInfo(**{k.lower(): v for k, v in dict_result.items()}) + return gi, line_number, next_line @@ -198,7 +211,7 @@ def _parse_header(lineno, header): return header[0:1], header_name.decode() -def _build_sigma(lineno, f, res): +def _build_sigma(f, res): gi = res["GridInfo"] fm = res["FlavourMap"] table = ( @@ -265,7 +278,24 @@ def _check_required_sections(res, lineno): def parse_fktable(f): """Parse an open byte stream into an FKTableData. Raise a BaadFKTableError - if problems are encountered.""" + if problems are encountered. + + Parameters + ---------- + f : file + Open file-like object. See :func:`open_fkpath`to obtain it. + + Returns + ------- + fktable : FKTableData + An object containing the FKTable data and information. + + Notes + ----- + This function operates at the level of a single file, and therefore it does + not apply CFactors (see :py:func:`load_fktable` for that) or handle operations + within COMPOUND ensembles. + """ line_and_stream = enumerate(f, start=1) res = {} lineno, header = next(line_and_stream) @@ -274,7 +304,7 @@ def parse_fktable(f): if header_name == "FastKernel": _check_required_sections(res, lineno) Q0 = res['TheoryInfo']['Q0'] - sigma = _build_sigma(lineno, f, res) + sigma = _build_sigma(f, res) hadronic = res['GridInfo'].hadronic ndata = res['GridInfo'].ndata xgrid = res.pop('xGrid') @@ -305,8 +335,19 @@ def parse_fktable(f): def parse_cfactor(f): - """Parse an open byte stream into a ``CFactorData``. 
Raise a
-    BadCFactorError if problems are encountered."""
+    """Parse an open byte stream into a :py:class:`CFactorData`. Raise a
+    BadCFactorError if problems are encountered.
+
+    Parameters
+    ----------
+    f : file
+        Binary file-like object
+
+    Returns
+    -------
+    cfac : CFactorData
+        An object containing the data on the cfactor for each point.
+    """
     stars = f.readline()
     if not stars.startswith(b'*'):
         raise BadCFactorError("First line should start with '*'.")

From 7a6f8b8287dad9974c264c6bb89a04973f645bf9 Mon Sep 17 00:00:00 2001
From: Zahari
Date: Thu, 2 Apr 2020 20:57:28 +0100
Subject: [PATCH 12/13] Add basic tests

In fairness one should also test the error paths but these are
unimportant and difficult enough that I can't be bothered. However more
quantitative tests are in fact needed, but those require some convolution
code.
---
 .../src/validphys/tests/test_fkparser.py      | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 validphys2/src/validphys/tests/test_fkparser.py

diff --git a/validphys2/src/validphys/tests/test_fkparser.py b/validphys2/src/validphys/tests/test_fkparser.py
new file mode 100644
index 0000000000..5b8057683d
--- /dev/null
+++ b/validphys2/src/validphys/tests/test_fkparser.py
@@ -0,0 +1,19 @@
+import pandas as pd
+
+from validphys.fkparser import load_fktable
+from validphys.loader import Loader
+
+
+def test_basic_loading():
+    l = Loader()
+    # Test both with and without cfactors, and load both DIS and hadronic
+    for cfac in ((), ('QCD',)):
+        fk = l.check_fktable(setname='ATLASTTBARTOT', theoryID=162, cfac=cfac)
+        res = load_fktable(fk)
+        assert res.ndata == 3
+        assert isinstance(res.sigma, pd.DataFrame)
+    fk = l.check_fktable(setname='H1HERAF2B', theoryID=162, cfac=())
+    res = load_fktable(fk)
+    assert res.ndata == 12
+    assert isinstance(res.sigma, pd.DataFrame)
+
From a0adaf6068b5ec33059b83f0940a09891be761ec Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Fri, 3 Apr 2020 09:46:45 +0200
Subject: [PATCH 13/13] fixed a few
typos
---
 doc/sphinx/source/vp/pydataobjs.rst  | 8 ++++----
 validphys2/src/validphys/fkparser.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/sphinx/source/vp/pydataobjs.rst b/doc/sphinx/source/vp/pydataobjs.rst
index 27e4b9973b..e1a897d900 100644
--- a/doc/sphinx/source/vp/pydataobjs.rst
+++ b/doc/sphinx/source/vp/pydataobjs.rst
@@ -4,10 +4,10 @@ Python based data objects
 =========================
 
 Internal data formats such as PDF sets, CommonData, or :ref:`FKTables
-` files are currently accessed though the `libnnpdf` C++ code
+` files are currently accessed through the `libnnpdf` C++ code
 (interfaced trough the SWIG wrappers). However there is a :ref:`project
 ` underway
-to make these resources available in terms of containers standard Python
+to make these resources available in terms of standard Python containers
 (particularly numpy arrays and pandas dataframes). The objectives include
 simplifying the codebase, increasing the ease of use and enabling more advanced
 computation and storage strategies.
@@ -16,7 +16,7 @@ Loading FKTables
 ----------------
 
 Currently only FKTables can be directly without C++ code. This is implemented
-in the :py:mod:`validphys.fkarser` module. For example::
+in the :py:mod:`validphys.fkparser` module. For example::
 
     from validphys.fkparser import load_fktable
     from validphys.loader import Loader
@@ -24,6 +24,6 @@ in the :py:mod:`validphys.fkarser` module. For example::
     fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',))
     res = load_fktable(fk)
 
-results in an object containing all the information needed to compute a
+results in an :py:class:`validphys.coredata.FKTableData` object containing all the information needed to compute a
 convolution. In particular the ``sigma`` property contains a dataframe
 representing the partonic cross-section (including the cfactors).
diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 82b22220cb..258d909b9d 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -3,7 +3,7 @@ datastructures, contained in the :py:mod:`validphys.coredata` module, which are not backed by C++ managed memory, and so they can be easily pickled and interfaces with common Python libraries. The integration of these objects into -the codebase is currently work in progrress, and at the moment this module +the codebase is currently work in progress, and at the moment this module serves as a proof of concept. Most users will be interested in using the high level interface @@ -277,7 +277,7 @@ def _check_required_sections(res, lineno): ) def parse_fktable(f): - """Parse an open byte stream into an FKTableData. Raise a BaadFKTableError + """Parse an open byte stream into an FKTableData. Raise a BadFKTableError if problems are encountered. Parameters