diff --git a/doc/validphys2/guide.md b/doc/validphys2/guide.md index d4c17a5a09..42a7f9aac1 100644 --- a/doc/validphys2/guide.md +++ b/doc/validphys2/guide.md @@ -1602,8 +1602,16 @@ configuration setting: the cuts for the given dataset. This is useful for example for requiring the common subset of points that pass the cuts at NLO and NNLO. +`use_cuts: 'fromsimilarpredictions'` + ~ Compute the intersection between two namespaces (similar to + `fromintersection`) but additionally require that the predictions computed for + each dataset across the namespaces are *similar*, specifically that the ratio + between the absolute difference in the predictions and the total experimental + uncertainty is smaller than a given value, `cut_similarity_threshold`, that + must be provided. Note that for this to work with different cfactors across + the namespaces, one must provide a different `dataset_inputs` list for each. -The following example demonstrates these options: +The following example demonstrates the first three options: ```yaml meta: @@ -1671,6 +1679,70 @@ for each one individually. With these settings the later three [dataspecs](#general-data-specification-the-dataspec-api) give the same result. 
+The following example demonstrates the use of `fromsimilarpredictions`: + +```yaml +meta: + title: "Test similarity cuts: Threshold 1,2" + author: Zahari Kassabov + keywords: [test] + +show_total: True + +NNLODatasts: &NNLODatasts +- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: [QCD]} # N +- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: [QCD]} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: [QCD]} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: [QCD]} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: [QCD]} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: [QCD]} # N + +NLODatasts: &NLODatasts +- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: []} # N +- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: []} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: []} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: []} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: []} # N +- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: []} # N + + +dataset_inputs: *NLODatasts + +cuts_intersection_spec: + - theoryid: 52 + pdf: NNPDF31_nlo_as_0118 + dataset_inputs: *NLODatasts + + - theoryid: 53 + pdf: NNPDF31_nlo_as_0118 + dataset_inputs: *NNLODatasts + + +theoryid: 52 +pdf: NNPDF31_nlo_as_0118 + +dataspecs: + + - use_cuts: internal + speclabel: "No cuts" + + + - cut_similarity_threshold: 2 + speclabel: "Threshold 2" + use_cuts: fromsimilarpredictions + + + - cut_similarity_threshold: 1 + speclabel: "Threshold 1" + use_cuts: fromsimilarpredictions + +template_text: | + {@dataspecs_chi2_table@} + +actions_: + - report(main=True) +``` + ### Data theory comparison The name of the data-theory comparison tool is `plot_fancy`. 
You can diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 0faab45183..d51d849203 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -35,6 +35,7 @@ ExperimentInput, CutsPolicy, MatchedCuts, + SimilarCuts, ThCovMatSpec, ) from validphys.loader import ( @@ -362,7 +363,13 @@ def produce_commondata(self, *, dataset_input, use_fitcommondata=False, fit=None except InconsistentMetaDataError as e: raise ConfigError(e) from e - def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None): + def parse_cut_similarity_threshold(self, th: numbers.Real): + """Maximum relative ratio when using `fromsimilarpredictions` cuts.""" + return th + + def produce_cuts( + self, *, commondata, use_cuts, rules, fit=None, theoryid=None, + ): """Obtain cuts for a given dataset input, based on the appropriate policy.""" # TODO: Put this bit of logic into loader.check_cuts @@ -384,7 +391,10 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None): if not theoryid: raise ConfigError("theoryid must be specified for internal cuts") return self.loader.check_internal_cuts(commondata, rules) - elif use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE: + elif ( + use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE + or use_cuts is CutsPolicy.FROM_SIMILAR_PREDICTIONS_NAMESPACE + ): cut_list = [] _, nss = self.parse_from_(None, "cuts_intersection_spec", write=False) self._check_dataspecs_type(nss) @@ -401,7 +411,43 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None): _, nscuts = self.parse_from_(None, "cuts", write=False) cut_list.append(nscuts) ndata = commondata.ndata - return MatchedCuts(cut_list, ndata=ndata) + matched_cuts = MatchedCuts(cut_list, ndata=ndata) + if use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE: + return matched_cuts + else: + if len(nss) != 2: + raise ConfigError("Can only work with two namespaces") + _, 
cut_similarity_threshold = self.parse_from_( + None, "cut_similarity_threshold", write=False + ) + name = commondata.name + inps = [] + for i, ns in enumerate(nss): + with self.set_context(ns=self._curr_ns.new_child({**ns,})): + # TODO: find a way to not duplicate this and use a dict + # instead of a linear search + _, dins = self.parse_from_(None, "dataset_inputs", write=False) + try: + di = next(d for d in dins if d.name == name) + except StopIteration as e: + raise ConfigError( + f"cuts_intersection_spec namespace {i}: dataset inputs must define {name}" + ) from e + + with self.set_context( + ns=self._curr_ns.new_child( + { + "dataset_input": di, + "use_cuts": CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE, + "cuts": matched_cuts, + **ns, + } + ) + ): + _, ds = self.parse_from_(None, "dataset", write=False) + _, pdf = self.parse_from_(None, "pdf", write=False) + inps.append((ds, pdf)) + return SimilarCuts(tuple(inps), cut_similarity_threshold) raise TypeError("Wrong use_cuts") @@ -455,17 +501,7 @@ def produce_dataset( return ds @configparser.element_of("experiments") - def parse_experiment( - self, - experiment: dict, - *, - theoryid, - use_cuts, - rules, - fit=None, - check_plotting: bool = False, - use_fitcommondata=False, - ): + def parse_experiment(self, experiment: dict): """A set of datasets where correlated systematics are taken into account. 
It is a mapping where the keys are the experiment name 'experiment' and a list of datasets.""" @@ -475,41 +511,12 @@ def parse_experiment( raise ConfigError( "'experiment' must be a mapping with " "'experiment' and 'datasets', but %s is missing" % e - ) + ) from e dsinputs = [self.parse_dataset_input(ds) for ds in datasets] - cds = [ - self.produce_commondata( - dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit - ) - for dsinp in dsinputs - ] - cutinps = [ - self.produce_cuts( - rules=rules, - commondata=cd, - use_cuts=use_cuts, - fit=fit, - theoryid=theoryid, - ) - for cd in cds - ] - # autogenerated func, from elemet_of - datasets = [ - self.produce_dataset( - rules=rules, - dataset_input=dsinp, - theoryid=theoryid, - cuts=cuts, - fit=fit, - check_plotting=check_plotting, - use_fitcommondata=use_fitcommondata, - ) - for (dsinp, cuts) in zip(dsinputs, cutinps) - ] - return DataGroupSpec(name=name, datasets=datasets, dsinputs=dsinputs) + return self.produce_data(group_name=name, data_input=dsinputs) @configparser.element_of("experiment_inputs") def parse_experiment_input(self, ei: dict): @@ -1174,49 +1181,16 @@ def produce_data( self, data_input, *, - theoryid, - use_cuts, - rules, - fit=None, - check_plotting: bool = False, - use_fitcommondata=False, group_name="data", ): """A set of datasets where correlated systematics are taken into account """ - # TODO: extract the commondata and cuts and seperate from dataset - cds = [ - self.produce_commondata( - dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit - ) - for dsinp in data_input - ] - cutinps = [ - self.produce_cuts( - rules=rules, - commondata=cd, - use_cuts=use_cuts, - fit=fit, - theoryid=theoryid, - ) - for cd in cds - ] + datasets = [] + for dsinp in data_input: + with self.set_context(ns=self._curr_ns.new_child({"dataset_input": dsinp})): + datasets.append(self.parse_from_(None, "dataset", write=False)[1]) - # autogenerated func, from element_of - datasets = [ - 
self.produce_dataset( - rules=rules, - dataset_input=dsinp, - theoryid=theoryid, - cuts=cuts, - fit=fit, - check_plotting=check_plotting, - use_fitcommondata=use_fitcommondata, - ) - for (dsinp, cuts) in zip(data_input, cutinps) - ] - # TODO: get rid of libnnpdf Experiment return DataGroupSpec(name=group_name, datasets=datasets, dsinputs=data_input) def _parse_data_input_from_( diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index f862cd0627..6ddead9b80 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -359,6 +359,8 @@ class CutsPolicy(enum.Enum): NOCUTS = "nocuts" FROMFIT = "fromfit" FROM_CUT_INTERSECTION_NAMESPACE = "fromintersection" + FROM_SIMILAR_PREDICTIONS_NAMESPACE = "fromsimilarpredictions" + class Cuts(TupleComp): def __init__(self, name, path): @@ -396,6 +398,40 @@ def load(self): self._full = True return np.arange(self.ndata) +class SimilarCuts(TupleComp): + def __init__(self, inputs, threshold): + if len(inputs) != 2: + raise ValueError("Expecting two input tuples") + firstcuts, secondcuts = inputs[0][0].cuts, inputs[1][0].cuts + if firstcuts != secondcuts: + raise ValueError("Expecting cuts to be the same for all datasets") + self.inputs = inputs + self.threshold = threshold + super().__init__(self.inputs, self.threshold) + + def load(self): + # TODO: Update this when a suitable interface becomes available + from validphys.convolution import central_predictions + from validphys.commondataparser import load_commondata + from validphys.covmats import covmat_from_systematics + + first, second = self.inputs + first_ds = first[0] + exp_err = np.sqrt( + np.diag( + covmat_from_systematics( + load_commondata(first_ds.commondata).with_cuts(first_ds.cuts) + ) + ) + ) + # Compute matched predictions + delta = np.abs( + (central_predictions(*first) - central_predictions(*second)).squeeze(axis=1) + ) + ratio = delta / exp_err + passed = ratio < self.threshold + return passed[passed].index + def 
cut_mask(cuts): """Return an objects that will act as the cuts when applied as a slice""" diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index bcbcd94db1..72fe6ff6d7 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -960,9 +960,9 @@ def plot_dataspecs_positivity( @make_argcheck def _check_display_cuts_requires_use_cuts(display_cuts, use_cuts): check( - not (display_cuts - and use_cuts not in (CutsPolicy.FROMFIT, CutsPolicy.INTERNAL)), - "The display_cuts option requires setting use_cuts to True") + not (display_cuts and use_cuts is CutsPolicy.NOCUTS), + "The display_cuts option requires setting some cuts", + ) @make_argcheck def _check_marker_by(marker_by): diff --git a/validphys2/src/validphys/tests/test_cuts.py b/validphys2/src/validphys/tests/test_cuts.py new file mode 100644 index 0000000000..415759608d --- /dev/null +++ b/validphys2/src/validphys/tests/test_cuts.py @@ -0,0 +1,18 @@ +from validphys.api import API +from validphys.core import SimilarCuts +from validphys.tests.conftest import THEORYID, PDF, DATA + + + +def test_similarity_cuts(): + plain = [{"dataset": dt["dataset"]} for dt in DATA] + inp = { + "theoryid": THEORYID, + "pdf": PDF, + "cut_similarity_threshold": 1.5, + "use_cuts": "fromsimilarpredictions", + "cuts_intersection_spec": [{"dataset_inputs": DATA}, {"dataset_inputs": plain}], + "dataset_input": DATA[1], + } + ds = API.dataset(**inp) + assert isinstance(ds.cuts, SimilarCuts)