Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
a331e04
added method that returns a new instance with modified central values…
Dec 13, 2022
beca837
added method to load validphys.core.CommonData from validphys.core.Co…
Dec 13, 2022
e192f80
added functions to write commondata and systype files and function to…
Dec 13, 2022
7e465c5
added functions used to generate pseudo data for closure tests
Dec 13, 2022
e4c8097
added test for validphys.pseudodata.make_level0_data function
Dec 13, 2022
61a9889
generation of level1 data done by make_replica function. Random seed …
Dec 13, 2022
7bf9018
import logging module within commondataparser
Dec 13, 2022
d9c2499
test_make_level0 data updated
Dec 13, 2022
c2af92b
theory 162 added
Dec 14, 2022
19d1f63
.
Dec 14, 2022
0a7442d
make_level0_data test done with theoryid 162
Dec 14, 2022
c1e5a16
added description to make_level1_data
Dec 14, 2022
b2e3a2c
added method to load list of validphys.coredata.CommonData instances …
Dec 14, 2022
ab0ca4c
list of commondata loaded with new DataGroupSpec method
Dec 14, 2022
c4dc48a
method name changed
Dec 14, 2022
cd75cc7
name of DataGroupSpec method changed
Dec 14, 2022
33d2ce9
reset_index of commondata tables
Dec 15, 2022
9a859b5
deleted test_filter_rebuild_closure_data.csv
Dec 15, 2022
eb438f9
regressions/test_filter_rebuild_closure_data.csv file updated
Dec 15, 2022
2c1abaa
bug in sytypes file name fixed
Dec 16, 2022
2c7f128
added functions to write commondata tables to files
Dec 16, 2022
16760bf
import new validphys.commondataparser functions to write commondata t…
Dec 16, 2022
9b8f5bc
added single_dataset
Dec 16, 2022
a1f9689
import info from conftest.py
Dec 16, 2022
d782d7a
unusued fakeset loaded with c++ removed
Dec 28, 2022
0ce9470
added functions to write commondata and systype data to buffer
Dec 28, 2022
66821be
write commondata and systype using commondataparser functions
Dec 28, 2022
6e86c5a
comment using numpy doc style
Dec 29, 2022
bd6be97
use assert_allclose from numpy.testing for arrays
comane Dec 31, 2022
a8d2055
use experiments_index to index level1 data in make_level1_data
comane Jan 4, 2023
f2a9624
use experiments_index provider
comane Jan 4, 2023
7e827c6
added commondata_wc provider to get commondata with cuts list given D…
comane Jan 4, 2023
3795663
make_level0_data renamed to level0_commondata_wc
comane Jan 4, 2023
cae6083
Merge branch 'use_only_python_commondata' into python_closure_samplin…
scarlehoff Jan 12, 2023
cc868fb
use the new export functions
scarlehoff Jan 12, 2023
2af5f39
Removed error rescaling within _filter_closure_data as unused. elimin…
comane Jan 31, 2023
40e6dc6
Merge branch 'master' into python_closure_sampling_merged
Zaharid Feb 7, 2023
68f389c
Update developing.yml
scarlehoff Feb 8, 2023
3fb3454
Improve formatting
Zaharid Feb 9, 2023
ab60f9d
load cd before if conditions
comane Feb 9, 2023
b4177bb
import dataset_t0_predictions outside of loop
comane Feb 9, 2023
af4b97d
import write commondata functions at the top of module
comane Feb 9, 2023
3e8161e
import at top of module
comane Feb 9, 2023
bc056bf
simplification of logic of _filter_closure_data function
comane Feb 10, 2023
1f61dfc
eliminated dependence of make_level1_data function on commondata_wc p…
comane Feb 10, 2023
167cbe2
commondata_wc provider deleted as superfluos
comane Feb 10, 2023
804ebca
_filter_closure_data no longer depends on prepare_nnpdf_rng and check…
comane Feb 13, 2023
9025c95
import write functions from commondatawriter to avoid circular import…
comane Feb 21, 2023
a4e6b39
module for writing commondata and systype table to file
comane Feb 21, 2023
d5fb7a0
write function in commondatawrite
comane Feb 21, 2023
5c2aaec
.
comane Feb 21, 2023
0da9735
Apply suggestions from code review
scarlehoff Feb 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions n3fit/runcards/examples/developing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ parameters: # This defines the parameter dictionary that is passed to the Model

fitting:
fitbasis: EVOL # EVOL (7), EVOLQED (8), etc.
savepseudodata: False
basis:
- {fl: sng, trainable: false, smallx: [1.093, 1.121], largex: [1.486, 3.287]}
- {fl: g, trainable: false, smallx: [0.8329, 1.071], largex: [3.084, 6.767]}
Expand Down
1 change: 1 addition & 0 deletions validphys2/src/validphys/commondataparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def parse_systypes(systypefile):
@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
"""Contains metadata information about the data being read"""

name: str
nsys: int
ndata: int
Expand Down
83 changes: 83 additions & 0 deletions validphys2/src/validphys/commondatawriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
This module contains functions to write commondata and systypes
tables to files
"""

def write_commondata_data(commondata, buffer):
    """
    Dump the commondata table of ``commondata`` into ``buffer``.

    The output starts with one header line of the form
    ``"<setname> <nsys> <ndata>"`` and is followed by
    ``commondata.commondata_table`` written as tab-separated values
    without a column-name row.

    Parameters
    ----------
    commondata : validphys.coredata.CommonData
        object providing ``setname``, ``nsys``, ``ndata`` and
        ``commondata_table``

    buffer : writable text stream
        anything exposing ``write`` that ``DataFrame.to_csv`` accepts,
        for instance an open file or a ``StringIO`` object

    Example
    -------
    >>> from validphys.loader import Loader
    >>> from io import StringIO

    >>> l = Loader()
    >>> cd = l.check_commondata("NMC").load_commondata_instance()
    >>> sio = StringIO()
    >>> write_commondata_data(cd, sio)
    >>> print(sio.getvalue())

    """
    buffer.write(f"{commondata.setname} {commondata.nsys} {commondata.ndata}\n")
    table = commondata.commondata_table
    table.to_csv(buffer, sep="\t", header=None)


def write_commondata_to_file(commondata, path):
    """
    Open ``path`` in text write mode and store the commondata table
    there, delegating the formatting to ``write_commondata_data``.
    """
    with open(path, "w") as stream:
        write_commondata_data(commondata, stream)


def write_systype_data(commondata, buffer):
    """
    Dump the systype table of ``commondata`` into ``buffer``.

    The output starts with one header line holding ``commondata.nsys``
    and is followed by ``commondata.systype_table`` written as
    tab-separated values without a column-name row.

    Parameters
    ----------
    commondata : validphys.coredata.CommonData
        object providing ``nsys`` and ``systype_table``

    buffer : writable text stream
        anything exposing ``write`` that ``DataFrame.to_csv`` accepts,
        for instance an open file or a ``StringIO`` object

    Example
    -------
    >>> from validphys.loader import Loader
    >>> from io import StringIO

    >>> l = Loader()
    >>> cd = l.check_commondata("NMC").load_commondata_instance()
    >>> sio = StringIO()
    >>> write_systype_data(cd, sio)
    >>> print(sio.getvalue())

    """
    buffer.write(f"{commondata.nsys}\n")
    table = commondata.systype_table
    table.to_csv(buffer, sep="\t", header=None)


def write_systype_to_file(commondata, path):
    """
    Open ``path`` in text write mode and store the systype table
    there, delegating the formatting to ``write_systype_data``.
    """
    with open(path, "w") as stream:
        write_systype_data(commondata, stream)
23 changes: 23 additions & 0 deletions validphys2/src/validphys/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,14 @@ def __iter__(self):
def load(self):
return parse_commondata(self.datafile, self.sysfile, self.name)

def load_commondata_instance(self):
    """
    Load this commondata specification through
    ``validphys.commondataparser.load_commondata`` and return the result
    (presumably a ``validphys.coredata.CommonData`` instance).
    """
    # Imported at call time rather than at module level.
    from validphys.commondataparser import load_commondata as _load_commondata

    return _load_commondata(self)

@property
def plot_kinlabels(self):
return get_plot_kinlabels(self)
Expand Down Expand Up @@ -618,6 +626,21 @@ def load(self):
def load_commondata(self):
return [d.load_commondata() for d in self.datasets]


def load_commondata_instance(self):
    """
    Return a list with one loaded commondata object per entry of
    ``self.datasets``, in the same order.

    Each dataset's commondata is loaded with ``load_commondata_instance``;
    when the dataset defines cuts (``dataset.cuts`` is not None) the loaded
    cuts are applied through ``with_cuts`` before the object is stored.
    """

    def _load_one(dataset):
        # Load first, then decide whether cuts have to be applied.
        loaded = dataset.commondata.load_commondata_instance()
        if dataset.cuts is None:
            return loaded
        return loaded.with_cuts(dataset.cuts.load())

    return [_load_one(dataset) for dataset in self.datasets]

@property
def thspec(self):
#TODO: Is this good enough? Should we explicitly pass the theory
Expand Down
20 changes: 11 additions & 9 deletions validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import dataclasses
import numpy as np
import pandas as pd

from validphys.commondatawriter import write_systype_to_file, write_commondata_to_file
KIN_NAMES = ["kin1", "kin2", "kin3"]


Expand Down Expand Up @@ -295,6 +295,11 @@ def get_kintable(self):
def central_values(self):
return self.commondata_table["data"]

def with_central_value(self, cv):
    """Return a new instance whose ``data`` column is set to ``cv``.

    The ``commondata_table`` of this object is copied before the column is
    overwritten, so the current instance is left unmodified; the new
    instance is built with ``dataclasses.replace``.
    """
    updated_table = self.commondata_table.copy()
    updated_table["data"] = cv
    return dataclasses.replace(self, commondata_table=updated_table)

def get_cv(self):
return self.central_values.values

Expand Down Expand Up @@ -364,21 +369,18 @@ def systematic_errors(self, central_values=None):
converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100
return pd.concat((self.additive_errors, converted_mult_errors), axis=1)


def export(self, path):
"""Export the data, and error types
Use the same format as libNNPDF:
Use the same format as libNNPDF:

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean the format is no longer the same as libNNPDF and so they are not compatible?

- A DATA_<dataset>.dat file with the dataframe of accepted points
- A systypes/STYPES_<dataset>.dat file with the error types
"""

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather have all the write functions in another file so they don't create a circular import.

And since they are the only part of commondataparser.py that gets imported in filters.py it won't change the structure. I also think that "namewise" they do not belong in a "parser" since they are doing the opposite.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree that they don't belong in the same place. The place I would look for write functions is the same place where the read functions are. Besides, unless we have the write functions in coredata.py, it's not going to be trivial to avoid circular imports.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None of the lines added to commondataparser.py need to import anything. You can just create a commondatawriter.py and then the problem is fixed. It is trivial to avoid circular imports.

The place I would look for write functions ...

If there is a file called commondatawriter.py next to it I'm sure you will open the right file.

Copy link
Copy Markdown
Contributor

@Zaharid Zaharid Feb 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reader functions need to import the data definitions they are reading into. Also I don't think there is anything wrong with circular imports in this case that justifies adding files with ten lines worth of code (not that it would avoid them, as said).

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reader functions need to import the data definitions they are reading into.

Which reader functions? I think you are thinking of a different thing?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was answering to

None of the lines added to commondataparser.py need to import anything.

saying that the functions that take files and return data structures do need to import the data structure, and there would be a circular import if you want a convenience method to read.

Copy link
Copy Markdown
Member Author

@scarlehoff scarlehoff Feb 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, but I really cannot see which of the lines added to commondataparser.py is using anything imported from the outside. They are all self-contained functions.

There are no read methods in the changes introduced by this Pull Request. I just want to have a commondatawriter.py with those functions.

edit: i just tried it out and the tests pass without a circular import

dat_path = path / f"DATA_{self.setname}.dat"
sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
sys_path.parent.mkdir(exist_ok=True)

dat_string_raw = self.commondata_table.to_string(index=False, header=False, float_format="{:.8e}".format)
header = f"{self.setname} {self.nsys} {self.ndata}"
dat_string = "\n".join([f" {i+1} {r}" for i, r in enumerate(dat_string_raw.split("\n"))])
dat_path.write_text(f"{header}\n{dat_string}\n")

sys_raw = self.systype_table.to_string(index=True, header=False, index_names=False)
sys_path.write_text(f"{self.nsys}\n{sys_raw}\n")
write_systype_to_file(self, sys_path)
write_commondata_to_file(self, dat_path)
153 changes: 104 additions & 49 deletions validphys2/src/validphys/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@

import numpy as np

from reportengine.checks import make_argcheck, check, check_positive, make_check
from reportengine.checks import check, make_check
from reportengine.compat import yaml
import validphys.cuts

from validphys.commondatawriter import (
write_commondata_to_file,
write_systype_to_file,
)
log = logging.getLogger(__name__)

KIN_LABEL = {
Expand Down Expand Up @@ -72,12 +75,6 @@ def default_filter_rules_input():
return yaml.safe_load(read_text(validphys.cuts, "filters.yaml"))


@make_argcheck
def check_rngalgo(rngalgo: int):
    """Argument check: ``rngalgo`` has to lie in the range [0, 16]."""
    in_range = 0 <= rngalgo < 17
    check(in_range, "Invalid rngalgo. Must be int between [0, 16].")


def check_nonnegative(var: str):
"""Ensure that `var` is positive"""
Expand All @@ -100,41 +97,20 @@ def export_mask(path, mask):
"""Dump mask to file"""
np.savetxt(path, mask, fmt='%d')

@check_rngalgo
@check_nonnegative('filterseed')
@check_nonnegative('seed')
def prepare_nnpdf_rng(filterseed: int, rngalgo: int, seed: int):
    """Initialise the internal NNPDF RNG, specified by ``rngalgo`` which must
    be an integer between 0 and 16, seeded with ``filterseed``.
    The RNG can then be subsequently used to i.e generate pseudodata.

    Parameters
    ----------
    filterseed : int
        seed set on the generator after it has been initialised
    rngalgo : int
        algorithm identifier passed to ``RandomGenerator.InitRNG``
    seed : int
        seed passed to ``RandomGenerator.InitRNG``

    Raises
    ------
    ImportError
        if the ``NNPDF`` python module cannot be imported
    """
    try:
        from NNPDF import RandomGenerator
    except ImportError:
        # Report through the module logger (like every other message here)
        # and re-raise the original exception unchanged.
        log.error("Generating closure data needs a valid installation of libNNPDF")
        raise

    log.warning("Importing libNNPDF")
    log.info("Initialising RNG")
    RandomGenerator.InitRNG(rngalgo, seed)
    RandomGenerator.GetRNG().SetSeed(filterseed)

@check_positive('errorsize')
def filter_closure_data(filter_path, data, fakepdf, fakenoise, errorsize, prepare_nnpdf_rng):

def filter_closure_data(filter_path, data, fakepdf, fakenoise, filterseed):
"""Filter closure data. In addition to cutting data points, the data is
generated from an underlying ``fakepdf``, applying a shift to the data
if ``fakenoise`` is ``True``, which emulates the experimental central values
being shifted away from the underlying law.

"""
log.info('Filtering closure-test data.')
return _filter_closure_data(
filter_path, data, fakepdf, fakenoise, errorsize)
return _filter_closure_data(filter_path, data, fakepdf, fakenoise, filterseed)


@check_positive("errorsize")
Comment thread
comane marked this conversation as resolved.
def filter_closure_data_by_experiment(
filter_path, experiments_data, fakepdf, fakenoise, errorsize, prepare_nnpdf_rng,
filter_path, experiments_data, fakepdf, fakenoise, filterseed, experiments_index
):
"""
Like :py:func:`filter_closure_data` except filters data by experiment.
Expand All @@ -145,10 +121,19 @@ def filter_closure_data_by_experiment(
not reproducible.

"""
return [
_filter_closure_data(filter_path, exp, fakepdf, fakenoise, errorsize)
for exp in experiments_data
]

res = []
for exp in experiments_data:
experiment_index = experiments_index[
experiments_index.isin([exp.name], level=0)
]
res.append(
_filter_closure_data(
filter_path, exp, fakepdf, fakenoise, filterseed, experiment_index
)
)

return res


def filter_real_data(filter_path, data):
Expand Down Expand Up @@ -183,6 +168,7 @@ def _write_ds_cut_data(path, dataset):

def _filter_real_data(filter_path, data):
"""Filter real experimental data."""

total_data_points = 0
total_cut_data_points = 0
for dataset in data.datasets:
Expand All @@ -194,24 +180,93 @@ def _filter_real_data(filter_path, data):
return total_data_points, total_cut_data_points


def _filter_closure_data(filter_path, data, fakepdf, fakenoise, errorsize):
"""Filter closure test data."""
def _filter_closure_data(
filter_path, data, fakepdf, fakenoise, filterseed, experiments_index
):
"""
This function is accessed within a closure test only, that is, the fakedata
namespace has to be True (If fakedata = False, the _filter_real_data function
will be used to write the commondata files).

The function writes commondata and systypes files within the
name_closure_test/filter folder.
If fakenoise is True, Level 1 type data is written to the filter folder, otherwise
Level 0 data is written.

Level 1 data is generated from the Level 0 data by adding noise sampled from
the experimental covariance matrix using the validphys.pseudodata.make_replica
function.

Parameters
----------

filter_path : str
path to filter folder

data : validphys.core.DataGroupSpec

fakepdf : validphys.core.PDF

fakenoise : bool
if fakenoise perform level1 shift of central data values

filterseed : int
random seed used for the generation of
random noise added to Level 0 data


experiments_index : pandas.MultiIndex


Returns
-------
tuple
total data points and points passing the cuts

"""
total_data_points = 0
total_cut_data_points = 0
fakeset = fakepdf.legacy_load()
# Load data, don't cache result
loaded_data = data.load.__wrapped__(data)
# generate level 1 shift if fakenoise
loaded_data.MakeClosure(fakeset, fakenoise)
for j, dataset in enumerate(data.datasets):

# circular import generated @ core.py
from validphys.pseudodata import level0_commondata_wc, make_level1_data

closure_data = level0_commondata_wc(data, fakepdf)

for dataset in data.datasets:
#== print number of points passing cuts, make dataset directory and write FKMASK ==#
path = filter_path / dataset.name
nfull, ncut = _write_ds_cut_data(path, dataset)
make_dataset_dir(path / "systypes")
total_data_points += nfull
total_cut_data_points += ncut
loaded_ds = loaded_data.GetSet(j)
if errorsize != 1.0:
loaded_ds.RescaleErrors(errorsize)
loaded_ds.Export(str(path))

if fakenoise:
#======= Level 1 closure test =======#

closure_data = make_level1_data(
data,
closure_data,
filterseed,
experiments_index,
)

#====== write commondata and systype files ======#
if fakenoise:
log.info("Writing Level1 data")
else:
log.info("Writing Level0 data")

for cd in closure_data:
path_cd = filter_path / cd.setname / f"DATA_{cd.setname}.dat"
path_sys = (
filter_path
/ cd.setname
/ "systypes"
/ f"SYSTYPE_{cd.setname}_DEFAULT.dat"
)
write_commondata_to_file(commondata=cd, path=path_cd)
write_systype_to_file(commondata=cd, path=path_sys)

return total_data_points, total_cut_data_points


Expand Down
Loading