diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 7f1f1932e3..7df8c3d589 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -44,6 +44,7 @@ requirements: - sphinx # documentation - recommonmark - sphinx_rtd_theme + - dataclasses # [py==36] test: requires: diff --git a/doc/sphinx/source/theory/FastInterface.rst b/doc/sphinx/source/theory/FastInterface.rst index eaec53fb29..6a9d650afb 100644 --- a/doc/sphinx/source/theory/FastInterface.rst +++ b/doc/sphinx/source/theory/FastInterface.rst @@ -1,3 +1,5 @@ +.. _fktables: + ============================================================ Fast Interface (FK tables) ============================================================ diff --git a/doc/sphinx/source/vp/index.rst b/doc/sphinx/source/vp/index.rst index c31753ba1a..236e6f4f92 100644 --- a/doc/sphinx/source/vp/index.rst +++ b/doc/sphinx/source/vp/index.rst @@ -14,3 +14,4 @@ vp-guide ./nnprofile.md ./scripts.md ./theorycov/index + ./pydataobjs.rst diff --git a/doc/sphinx/source/vp/pydataobjs.rst b/doc/sphinx/source/vp/pydataobjs.rst new file mode 100644 index 0000000000..e1a897d900 --- /dev/null +++ b/doc/sphinx/source/vp/pydataobjs.rst @@ -0,0 +1,29 @@ +.. _pyobjs: + +Python based data objects +========================= + +Internal data formats such as PDF sets, CommonData, or :ref:`FKTables +` files are currently accessed through the `libnnpdf` C++ code +(interfaced trough the SWIG wrappers). However there is a :ref:`project +` underway +to make these resources available in terms of standard Python containers +(particularly numpy arrays and pandas dataframes). The objectives include +simplifying the codebase, increasing the ease of use and enabling more advanced +computation and storage strategies. + +Loading FKTables +---------------- + +Currently only FKTables can be directly without C++ code. This is implemented +in the :py:mod:`validphys.fkparser` module. 
For example:: + + from validphys.fkparser import load_fktable + from validphys.loader import Loader + l = Loader() + fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',)) + res = load_fktable(fk) + +results in an :py:mod:`validphys.coredata.FKTableData` object containing all the information needed to compute a +convolution. In particular the ``sigma`` property contains a dataframe +representing the partonic cross-section (including the cfactors). diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py new file mode 100644 index 0000000000..e307c7e6ea --- /dev/null +++ b/validphys2/src/validphys/coredata.py @@ -0,0 +1,71 @@ +""" +Data containers backed by Python managed memory (Numpy arrays and Pandas +dataframes). This module is intended to substitute large parts of the C++ +wrappers. + +""" +import dataclasses +import numpy as np +import pandas as pd + +@dataclasses.dataclass(eq=False) +class FKTableData: + """ + Data contained in an FKTable + + Parameters + ---------- + hadronic : bool + Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed. + + Q0 : float + The scale at which the PDFs should be evaluated (in GeV). + + ndata : int + The number of data points in the grid. + + xgrid : array, shape (nx) + The points in x at which the PDFs should be evaluated. + + sigma : DataFrame + For hadronic data, the columns are the indexes in the ``NfxNf`` list of + possible flavour combinations of two PDFs. The MultiIndex contains + three keys, the data index, an index into ``xgrid`` for the first PDF + and an idex into ``xgrid`` for the second PDF, indicating if the points in + ``x`` where the PDF should be evaluated. + + For DIS data, the columns are indexes in the ``Nf`` list of flavours. + The MultiIndex contains two keys, the data index and an index into + ``xgrid`` indicating the points in ``x`` where the PDF should be + evaluated. + + metadata : dict + Other information contained in the FKTable. 
+ """ + hadronic: bool + Q0: float + ndata: int + xgrid: np.array + sigma: pd.DataFrame + metadata: dict = dataclasses.field(default_factory=dict, repr=False) + +@dataclasses.dataclass(eq=False) +class CFactorData: + """ + Data contained in a CFactor + + Parameters + ---------- + + description : str + Information on how the data was obtained. + + central_value : array, shape(ndata) + The value of the cfactor for each data point. + + uncertainty : array, shape(ndata) + The absolute uncertainty on the cfactor if available. + """ + description: str + central_value: np.array + uncertainty: np.array diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py new file mode 100644 index 0000000000..258d909b9d --- /dev/null +++ b/validphys2/src/validphys/fkparser.py @@ -0,0 +1,368 @@ +""" +This module implements parsers for FKtable and CFactor files into useful +datastructures, contained in the :py:mod:`validphys.coredata` module, which are +not backed by C++ managed memory, and so they can be easily pickled and +interfaces with common Python libraries. The integration of these objects into +the codebase is currently work in progress, and at the moment this module +serves as a proof of concept. + +Most users will be interested in using the high level interface +:py:func:`load_fktable`. Given a :py:class:`validphys.core.FKTableSpec` +object, it returns an instance of :py:class:`validphys.coredata.FKTableData`, +an object with the required information to compute a convolution, with the +CFactors applied. + +.. 
code-block:: python + + from validphys.fkparser import load_fktable + from validphys.loader import Loader + l = Loader() + fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',)) + res = load_fktable(fk) +""" +import io +import functools +import tarfile +import dataclasses + +import numpy as np +import pandas as pd + +from validphys.coredata import FKTableData, CFactorData + + + + +class BadCFactorError(Exception): + """Exception raised when a CFactor cannot be parsed correctly""" + + +class BadFKTableError(Exception): + """Exception raised when an FKTable cannot be parsed correctly""" + + +@dataclasses.dataclass(frozen=True) +class GridInfo: + """Class containing the basic properties of an FKTable grid.""" + setname: str + hadronic: bool + ndata: int + nx: int + +def load_fktable(spec): + """Load the data corresponding to a FKSpec object. The cfactors + will be applied to the grid.""" + with open_fkpath(spec.fkpath) as handle: + tabledata = parse_fktable(handle) + if not spec.cfactors: + return tabledata + + ndata = tabledata.ndata + cfprod = np.ones(ndata) + for cf in spec.cfactors: + with open(cf, "rb") as f: + cfdata = parse_cfactor(f) + if len(cfdata.central_value) != ndata: + raise BadCFactorError( + "Length of cfactor data does not match the length of the fktable." + ) + cfprod *= cfdata.central_value + # TODO: Find a way to do this in place + tabledata.sigma = tabledata.sigma.multiply(pd.Series(cfprod), axis=0, level=0) + return tabledata + +def _get_compressed_buffer(path): + archive = tarfile.open(path) + members = archive.getmembers() + l = len(members) + if l != 1: + raise BadFKTableError( + f"Archive {path} should contain one file, but it contains {l}.") + return archive.extractfile(members[0]) + + +def open_fkpath(path): + """Return a file-like object from the fktable path, regardless of whether + it is compressed + + Parameters + ---------- + path: Path or str + Path like file containing a valid FKTable. 
It can be either inside a + tarball or in plain text. + + Returns + ------- + f: file + A file like object for further processing. + """ + if tarfile.is_tarfile(path): + return _get_compressed_buffer(path) + return open(path, 'rb') + + +def _is_header_line(line): + return line.startswith((b'_', b'{')) + +def _bytes_to_bool(x): + return bool(int(x)) + +def _parse_fk_options(line_and_stream, value_parsers=None): + """Parse a sequence of lines of the form + *OPTION: VALUE + into a dictionary. + """ + res = {} + if value_parsers is None: + value_parsers = {} + for lineno, next_line in line_and_stream: + if _is_header_line(next_line): + return res, lineno, next_line + if not next_line.startswith(b'*'): + raise BadFKTableError(f"Error on line {lineno}: Expecting an option starting with '*'") + try: + keybytes, valuebytes = next_line.split(b':', maxsplit=1) + except ValueError: + raise BadFKTableError(f"Error on line {lineno}: Expecting an option containing ':'") + key = keybytes[1:].strip().decode() + if key in value_parsers: + try: + value = value_parsers[key](valuebytes) + except Exception as e: + raise BadFKTableError(f"Could not parse key {key} on line {lineno}") from e + else: + value = valuebytes.strip().decode() + res[key] = value + + raise BadFKTableError("FKTable should end with FastKernel spec, not with a set of options") + + +def _segment_parser(f): + @functools.wraps(f) + def f_(line_and_stream): + buf = io.BytesIO() + for lineno, next_line in line_and_stream: + if _is_header_line(next_line): + processed = f(buf) + return processed, lineno, next_line + buf.write(next_line) + raise BadFKTableError("FKTable should end with FastKernel spec, not with a segment string") + return f_ + +@_segment_parser +def _parse_string(buf): + return buf.getvalue().decode() + +@_segment_parser +def _parse_flavour_map(buf): + buf.seek(0) + return np.loadtxt(buf, dtype=bool) + +@_segment_parser +def _parse_xgrid(buf): + return np.fromstring(buf.getvalue(), sep='\n') + +# This used 
a different interface from segment parser because we want it to +# be fast. +# We assume it is going to be the last section. +def _parse_hadronic_fast_kernel(f): + """Parse the FastKernel section of a hadronic FKTable into a DataFrame. + ``f`` should be a stream containing only the section""" + # Note that we need the slower whitespace here because it turns out + # that there are fktables where space and tab are used as separators + # within the same table. + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1,2)) + df.columns = list(range(14*14)) + df.index.names = ['data', 'x1', 'x2'] + return df + +def _parse_dis_fast_kernel(f): + """Parse the FastKernel section of a DIS FKTable into a DataFrame. + ``f`` should be a stream containing only the section""" + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1)) + df.columns = list(range(14)) + df.index.names = ['data', 'x'] + return df + + +def _parse_gridinfo(line_and_stream): + dict_result, line_number, next_line = _parse_fk_options( + line_and_stream, + value_parsers={ + "HADRONIC": _bytes_to_bool, + "NDATA": int, + "NX": int + }) + gi = GridInfo(**{k.lower(): v for k, v in dict_result.items()}) + return gi, line_number, next_line + + + +def _parse_header(lineno, header): + if not _is_header_line(header): + raise BadFKTableError(f"Bad header at line {lineno}: First " + "character should be either '_' or '{'") + try: + endname = header.index(b'_', 1) + except ValueError: + raise BadFKTableError(f"Bad header at line {lineno}: Expected '_' after name") from None + header_name = header[1:endname] + #Note: This is not the same as header[0]. Bytes iterate as ints. 
+ return header[0:1], header_name.decode() + + +def _build_sigma(f, res): + gi = res["GridInfo"] + fm = res["FlavourMap"] + table = ( + _parse_hadronic_fast_kernel(f) if gi.hadronic else _parse_dis_fast_kernel(f) + ) + # Filter out empty flavour indices + table = table.loc[:, fm.ravel()] + return table + +_KNOWN_SEGMENTS = { + "GridDesc": _parse_string, + "VersionInfo": _parse_fk_options, + "GridInfo": _parse_gridinfo, + "FlavourMap": _parse_flavour_map, + "xGrid": _parse_xgrid, + "TheoryInfo": functools.partial( + _parse_fk_options, + value_parsers={ + "ID": int, + "PTO": int, + "DAMP": _bytes_to_bool, + "IC": _bytes_to_bool, + "XIR": float, + "XIF": float, + "NfFF": int, + "MaxNfAs": int, + "MaxNfPdf": int, + "Q0": float, + "alphas": float, + "Qref": float, + "QED": _bytes_to_bool, + "alphaqed": float, + "Qedref": float, + "SxRes": _bytes_to_bool, + "mc": float, + "Qmc": float, + "kcThr": float, + "mb": float, + "Qmb": float, + "kbThr": float, + "mt": float, + "Qmt": float, + "ktThr": float, + "MZ": float, + "MW": float, + "GF": float, + "SIN2TW": float, + "TMC": _bytes_to_bool, + "MP": float, + "global_nx": int, + "EScaleVar": _bytes_to_bool, + }, + ), +} + +def _check_required_sections(res, lineno): + """Check that we have found all the required sections by the time we + reach 'FastKernel'""" + for section in _KNOWN_SEGMENTS: + if section not in res: + raise BadFKTableError( + f"{section} must come before 'FastKernel' section at {lineno}" + ) + +def parse_fktable(f): + """Parse an open byte stream into an FKTableData. Raise a BadFKTableError + if problems are encountered. + + Parameters + ---------- + f : file + Open file-like object. See :func:`open_fkpath`to obtain it. + + Returns + ------- + fktable : FKTableData + An object containing the FKTable data and information. 
+ + Notes + ----- + This function operates at the level of a single file, and therefore it does + not apply CFactors (see :py:func:`load_fktable` for that) or handle operations + within COMPOUND ensembles. + """ + line_and_stream = enumerate(f, start=1) + res = {} + lineno, header = next(line_and_stream) + while True: + marker, header_name = _parse_header(lineno, header) + if header_name == "FastKernel": + _check_required_sections(res, lineno) + Q0 = res['TheoryInfo']['Q0'] + sigma = _build_sigma(f, res) + hadronic = res['GridInfo'].hadronic + ndata = res['GridInfo'].ndata + xgrid = res.pop('xGrid') + return FKTableData( + sigma=sigma, + ndata=ndata, + Q0=Q0, + metadata=res, + hadronic=hadronic, + xgrid=xgrid, + ) + elif header_name in _KNOWN_SEGMENTS: + parser = _KNOWN_SEGMENTS[header_name] + elif marker == b'{': + parser = _parse_string + elif marker == b'_': + parser = _parse_fk_options + else: + raise RuntimeError("Should not be here") + try: + out, lineno, header = parser(line_and_stream) + except Exception as e: + # Note that the old lineno is the one we want + raise BadFKTableError( + f"Failed processing header {header_name} on line {lineno}" + ) from e + res[header_name] = out + + +def parse_cfactor(f): + """Parse an open byte stream into a :py:class:`CFactorData`. Raise a + BadCFactorError if problems are encountered. + + Parameters + ---------- + f : file + Binary file-like object + + Returns + ------- + cfac : CFactorData + An object containing the data on the cfactor for each point. 
+ """ + stars = f.readline() + if not stars.startswith(b'*'): + raise BadCFactorError("First line should start with '*'.") + descbytes = io.BytesIO() + for line in f: + if line.startswith(b'*'): + break + descbytes.write(line) + description = descbytes.getvalue().decode() + try: + data = np.loadtxt(f) + except Exception as e: + raise BadCFactorError(e) from e + central_value = data[:, 0] + uncertainty = data[:, 1] + return CFactorData( + description=description, central_value=central_value, uncertainty=uncertainty + ) diff --git a/validphys2/src/validphys/tests/test_fkparser.py b/validphys2/src/validphys/tests/test_fkparser.py new file mode 100644 index 0000000000..5b8057683d --- /dev/null +++ b/validphys2/src/validphys/tests/test_fkparser.py @@ -0,0 +1,19 @@ +import pandas as pd + +from validphys.fkparser import load_fktable +from validphys.loader import Loader + + +def test_basic_loading(): + l = Loader() + # Test both with and without cfactors, and load both DIS and hadronic + for cfac in ((), ('QCD',)): + fk = l.check_fktable(setname='ATLASTTBARTOT', theoryID=162, cfac=cfac) + res = load_fktable(fk) + assert res.ndata == 3 + assert isinstance(res.sigma, pd.DataFrame) + fk = l.check_fktable(setname='H1HERAF2B', theoryID=162, cfac=()) + res = load_fktable(fk) + assert res.ndata == 12 + assert isinstance(res.sigma, pd.DataFrame) +