74 changes: 73 additions & 1 deletion doc/validphys2/guide.md
@@ -1602,8 +1602,16 @@ configuration setting:
the cuts for the given dataset. This is useful for example for
requiring the common subset of points that pass the cuts at NLO and
NNLO.
`use_cuts: 'fromsimilarpredictions'`
~ Compute the intersection between two namespaces (similarly to
`fromintersection`), but additionally require that the predictions computed for
each dataset across the namespaces are *similar*: specifically, that the ratio
between the absolute difference in the predictions and the total experimental
uncertainty is smaller than a given value, `cut_similarity_threshold`, which
must be provided. Note that for this to work with different cfactors across
the namespaces, one must provide a different `dataset_inputs` list for each.
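
Schematically, a point is kept when the absolute difference between the two
central predictions, divided by the experimental uncertainty on that point, is
below `cut_similarity_threshold`. A minimal sketch of that check (purely
illustrative; this is not the actual validphys implementation and the function
name is made up):

```python
import numpy as np

def similarity_mask(pred_a, pred_b, exp_err, threshold):
    """Keep points whose predictions differ by less than ``threshold``
    experimental standard deviations (illustrative sketch only)."""
    ratio = np.abs(np.asarray(pred_a) - np.asarray(pred_b)) / np.asarray(exp_err)
    return ratio < threshold
```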

The following example demonstrates these options:
The following example demonstrates the first three options:

```yaml
meta:
  # ... (rest of this example unchanged and collapsed in the diff view)
```

@@ -1671,6 +1679,70 @@

for each one individually. With these settings the latter three
[dataspecs](#general-data-specification-the-dataspec-api) give the
same result.

The following example demonstrates the use of `fromsimilarpredictions`:

```yaml
meta:
title: "Test similarity cuts: Threshold 1,2"
author: Zahari Kassabov
keywords: [test]

show_total: True

NNLODatasts: &NNLODatasts
- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: [QCD]} # N

NLODatasts: &NLODatasts
- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: []} # N


dataset_inputs: *NLODatasts

cuts_intersection_spec:
- theoryid: 52
pdf: NNPDF31_nlo_as_0118
dataset_inputs: *NLODatasts

- theoryid: 53
pdf: NNPDF31_nlo_as_0118
dataset_inputs: *NNLODatasts


theoryid: 52
pdf: NNPDF31_nlo_as_0118

dataspecs:

- use_cuts: internal
speclabel: "No cuts"


- cut_similarity_threshold: 2
speclabel: "Threshold 2"
use_cuts: fromsimilarpredictions


- cut_similarity_threshold: 1
speclabel: "Threshold 1"
use_cuts: fromsimilarpredictions

template_text: |
    {@dataspecs_chi2_table@}

actions_:
- report(main=True)
```
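
The same policy can also be exercised programmatically through the validphys
API, which is how the new test in this pull request checks it. The following is
a condensed sketch along those lines (the dataset lists are shortened for
illustration; the theory and PDF choices mirror the runcard above, and the
theories and PDF must be available locally):

```python
from validphys.api import API

nlo_datasets = [
    {"dataset": "ATLAS_SINGLETOP_TCH_R_7TEV", "frac": 1.0, "cfac": []},
    {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "frac": 1.0, "cfac": []},
]
nnlo_datasets = [
    {"dataset": "ATLAS_SINGLETOP_TCH_R_7TEV", "frac": 1.0, "cfac": ["QCD"]},
    {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "frac": 1.0, "cfac": ["QCD"]},
]

ds = API.dataset(
    dataset_input=nlo_datasets[0],
    theoryid=52,
    pdf="NNPDF31_nlo_as_0118",
    use_cuts="fromsimilarpredictions",
    cut_similarity_threshold=2,
    cuts_intersection_spec=[
        {"theoryid": 52, "pdf": "NNPDF31_nlo_as_0118", "dataset_inputs": nlo_datasets},
        {"theoryid": 53, "pdf": "NNPDF31_nlo_as_0118", "dataset_inputs": nnlo_datasets},
    ],
)
print(ds.cuts.load())  # indices of the points that pass the similarity cut
```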

### Data theory comparison

The name of the data-theory comparison tool is `plot_fancy`. You can
138 changes: 56 additions & 82 deletions validphys2/src/validphys/config.py
@@ -35,6 +35,7 @@
ExperimentInput,
CutsPolicy,
MatchedCuts,
SimilarCuts,
ThCovMatSpec,
)
from validphys.loader import (
@@ -362,7 +363,13 @@ def produce_commondata(self, *, dataset_input, use_fitcommondata=False, fit=None
except InconsistentMetaDataError as e:
raise ConfigError(e) from e

def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
def parse_cut_similarity_threshold(self, th: numbers.Real):
"""Maximum relative ratio when using `fromsimilarpredictons` cuts."""
return th

def produce_cuts(
self, *, commondata, use_cuts, rules, fit=None, theoryid=None,
):
"""Obtain cuts for a given dataset input, based on the
appropriate policy."""
# TODO: Put this bit of logic into loader.check_cuts
@@ -384,7 +391,10 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
if not theoryid:
raise ConfigError("theoryid must be specified for internal cuts")
return self.loader.check_internal_cuts(commondata, rules)
elif use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE:
elif (
use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE
or use_cuts is CutsPolicy.FROM_SIMILAR_PREDICTIONS_NAMESPACE
):
cut_list = []
_, nss = self.parse_from_(None, "cuts_intersection_spec", write=False)
self._check_dataspecs_type(nss)
@@ -401,7 +411,43 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
_, nscuts = self.parse_from_(None, "cuts", write=False)
cut_list.append(nscuts)
ndata = commondata.ndata
return MatchedCuts(cut_list, ndata=ndata)
matched_cuts = MatchedCuts(cut_list, ndata=ndata)
if use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE:
return matched_cuts
else:
if len(nss) != 2:
raise ConfigError("Can only work with two namespaces")
_, cut_similarity_threshold = self.parse_from_(
None, "cut_similarity_threshold", write=False
)
name = commondata.name
inps = []
for i, ns in enumerate(nss):
with self.set_context(ns=self._curr_ns.new_child({**ns,})):
# TODO: find a way to not duplicate this and use a dict
# instead of a linear search
_, dins = self.parse_from_(None, "dataset_inputs", write=False)
try:
di = next(d for d in dins if d.name == name)
except StopIteration as e:
raise ConfigError(
f"cuts_intersection_spec namespace {i}: dataset inputs must define {name}"
) from e

with self.set_context(
ns=self._curr_ns.new_child(
{
"dataset_input": di,
"use_cuts": CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE,
**Contributor:** can we not just set `"cuts": matched_cuts`, or does that break something?

**Contributor:** Seems suboptimal to have to reproduce the exact same set of cuts, especially considering we saved them earlier, but I don't know about patching objects into the namespace directly.
"cuts": matched_cuts,
**ns,
}
)
):
_, ds = self.parse_from_(None, "dataset", write=False)
_, pdf = self.parse_from_(None, "pdf", write=False)
inps.append((ds, pdf))
return SimilarCuts(tuple(inps), cut_similarity_threshold)

raise TypeError("Wrong use_cuts")

@@ -455,17 +501,7 @@ def produce_dataset(
return ds

@configparser.element_of("experiments")
def parse_experiment(
self,
experiment: dict,
*,
theoryid,
use_cuts,
rules,
fit=None,
check_plotting: bool = False,
use_fitcommondata=False,
):
def parse_experiment(self, experiment: dict):
"""A set of datasets where correlated systematics are taken
into account. It is a mapping where the keys are the experiment
name 'experiment' and a list of datasets."""
@@ -475,41 +511,12 @@ def parse_experiment(
raise ConfigError(
"'experiment' must be a mapping with "
"'experiment' and 'datasets', but %s is missing" % e
)
) from e

dsinputs = [self.parse_dataset_input(ds) for ds in datasets]
cds = [
self.produce_commondata(
dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit
)
for dsinp in dsinputs
]
cutinps = [
self.produce_cuts(
rules=rules,
commondata=cd,
use_cuts=use_cuts,
fit=fit,
theoryid=theoryid,
)
for cd in cds
]

# autogenerated func, from elemet_of
datasets = [
self.produce_dataset(
rules=rules,
dataset_input=dsinp,
theoryid=theoryid,
cuts=cuts,
fit=fit,
check_plotting=check_plotting,
use_fitcommondata=use_fitcommondata,
)
for (dsinp, cuts) in zip(dsinputs, cutinps)
]

return DataGroupSpec(name=name, datasets=datasets, dsinputs=dsinputs)
return self.produce_data(group_name=name, data_input=dsinputs)

@configparser.element_of("experiment_inputs")
def parse_experiment_input(self, ei: dict):
@@ -1174,49 +1181,16 @@ def produce_data(
self,
data_input,
*,
theoryid,
use_cuts,
rules,
fit=None,
check_plotting: bool = False,
use_fitcommondata=False,
group_name="data",
):
"""A set of datasets where correlated systematics are taken
into account
"""
# TODO: extract the commondata and cuts and seperate from dataset
cds = [
self.produce_commondata(
dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit
)
for dsinp in data_input
]
cutinps = [
self.produce_cuts(
rules=rules,
commondata=cd,
use_cuts=use_cuts,
fit=fit,
theoryid=theoryid,
)
for cd in cds
]
datasets = []
for dsinp in data_input:
with self.set_context(ns=self._curr_ns.new_child({"dataset_input": dsinp})):
datasets.append(self.parse_from_(None, "dataset", write=False)[1])

# autogenerated func, from element_of
datasets = [
self.produce_dataset(
rules=rules,
dataset_input=dsinp,
theoryid=theoryid,
cuts=cuts,
fit=fit,
check_plotting=check_plotting,
use_fitcommondata=use_fitcommondata,
)
for (dsinp, cuts) in zip(data_input, cutinps)
]
# TODO: get rid of libnnpdf Experiment
return DataGroupSpec(name=group_name, datasets=datasets, dsinputs=data_input)

def _parse_data_input_from_(
36 changes: 36 additions & 0 deletions validphys2/src/validphys/core.py
@@ -359,6 +359,8 @@ class CutsPolicy(enum.Enum):
NOCUTS = "nocuts"
FROMFIT = "fromfit"
FROM_CUT_INTERSECTION_NAMESPACE = "fromintersection"
FROM_SIMILAR_PREDICTIONS_NAMESPACE = "fromsimilarpredictions"


class Cuts(TupleComp):
def __init__(self, name, path):
@@ -396,6 +398,40 @@ def load(self):
self._full = True
return np.arange(self.ndata)

class SimilarCuts(TupleComp):
def __init__(self, inputs, threshold):
if len(inputs) != 2:
raise ValueError("Expecting two input tuples")
firstcuts, secondcuts = inputs[0][0].cuts, inputs[1][0].cuts
if firstcuts != secondcuts:
raise ValueError("Expecting cuts to be the same for all datasets")
self.inputs = inputs
self.threshold = threshold
super().__init__(self.inputs, self.threshold)
**Contributor:** @Zaharid just a general question about why we are inheriting from `TupleComp` here and in the other related classes, because I can't see anywhere where the methods from `TupleComp` are used?

**wilsonmr (Contributor), Jan 25, 2021:** The methods of `TupleComp` mean, amongst other things, that the subclasses are hashable. For example, you can instantiate two such classes with the same arguments and you will see that they are equal. See with `PDF`:

```python
>>> from validphys.core import PDF
>>> pdf1 = PDF("foo")
>>> pdf2 = PDF("foo")
>>> pdf1 == pdf2
True
```

This is useful because, as far as we are concerned, in this case a PDF is fully defined by its name (there should only be one NNPDF31_nnlo_as_0118, and that is all I need to know to get the correct PDF from the LHAPDF share folder).

Answering where this gets used: probably by the reportengine resource builder at some point, to find unique dependencies (but I don't know; Zahari can answer). However, all the classes which have their `load` method decorated with `@functools.lru_cache` must also be hashable, because the `lru_cache` stores the result of calling a function with a set of arguments, and to check whether it has already cached the result it needs to be able to see whether the arguments are actually the same. Since `load` always takes `self` as the first positional argument, you need to be able to check that `self` is indeed the same for two subsequent calls, which `TupleComp` facilitates.

There is also another double-underscore method which means that the classes print nicely:

```python
>>> pdf1
PDF(name='foo')
```

**Contributor:** Oh, and I should say that by default the first thing I said isn't the case:

```python
>>> class TestClass:
...  def __init__(self, a, b): self.a = a; self.b = b
...
>>> cls1 = TestClass(1, 2)
>>> cls2 = TestClass(1, 2)
>>> cls1 == cls2
False
```

Perhaps I misunderstood something, but that's my understanding anyway.

**Contributor (author):** `TupleComp` is mostly a simpleminded attempt to implement something like dataclasses from the standard library, before they existed (and with only the required properties). I would probably use those nowadays, despite some unwelcome complexity (and in fact this is already done in various newer interfaces, such as the fkparser). Note there is #408.

**Contributor:** Thank you both!
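
A minimal, self-contained illustration of the `lru_cache` point made in the
discussion above (purely illustrative; `Key` is a made-up class and not part of
validphys):

```python
import functools

class Key:
    """Hashable by value, in the same spirit as TupleComp subclasses."""

    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        return type(other) is type(self) and other.name == self.name

    def __hash__(self):
        return hash((type(self), self.name))

    @functools.lru_cache()
    def load(self):
        print(f"loading {self.name}")  # executed once per distinct key value
        return self.name.upper()

Key("foo").load()  # computes the result and prints "loading foo"
Key("foo").load()  # a different instance, but equal and hashable: cache hit, no print
```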


def load(self):
        # TODO: Update this when a suitable interface becomes available
from validphys.convolution import central_predictions
from validphys.commondataparser import load_commondata
from validphys.covmats import covmat_from_systematics

first, second = self.inputs
first_ds = first[0]
exp_err = np.sqrt(
np.diag(
covmat_from_systematics(
load_commondata(first_ds.commondata).with_cuts(first_ds.cuts)
)
)
)
# Compute matched predictions
delta = np.abs(
(central_predictions(*first) - central_predictions(*second)).squeeze(axis=1)
)
ratio = delta / exp_err
passed = ratio < self.threshold
return passed[passed].index


def cut_mask(cuts):
"""Return an objects that will act as the cuts when applied as a slice"""
6 changes: 3 additions & 3 deletions validphys2/src/validphys/dataplots.py
@@ -960,9 +960,9 @@ def plot_dataspecs_positivity(
@make_argcheck
def _check_display_cuts_requires_use_cuts(display_cuts, use_cuts):
check(
not (display_cuts
and use_cuts not in (CutsPolicy.FROMFIT, CutsPolicy.INTERNAL)),
"The display_cuts option requires setting use_cuts to True")
not (display_cuts and use_cuts is CutsPolicy.NOCUTS),
"The display_cuts option requires setting some cuts",
)

@make_argcheck
def _check_marker_by(marker_by):
18 changes: 18 additions & 0 deletions validphys2/src/validphys/tests/test_cuts.py
@@ -0,0 +1,18 @@
from validphys.api import API
from validphys.core import SimilarCuts
from validphys.tests.conftest import THEORYID, PDF, DATA



def test_similarity_cuts():
    plain = [{"dataset": dt["dataset"]} for dt in DATA]
    inp = {
        "theoryid": THEORYID,
        "pdf": PDF,
        "cut_similarity_threshold": 1.5,
        "use_cuts": "fromsimilarpredictions",
        "cuts_intersection_spec": [{"dataset_inputs": DATA}, {"dataset_inputs": plain}],
        "dataset_input": DATA[1],
    }
    ds = API.dataset(**inp)
    assert isinstance(ds.cuts, SimilarCuts)