74 changes: 73 additions & 1 deletion doc/validphys2/guide.md
@@ -1602,8 +1602,16 @@ configuration setting:
the cuts for the given dataset. This is useful for example for
requiring the common subset of points that pass the cuts at NLO and
NNLO.
`use_cuts: 'fromsimilarpredictions'`
~ Compute the intersection between two namespaces (similarly to
`fromintersection`), but additionally require that the predictions computed for
each dataset across the namespaces are *similar*: specifically, that the ratio
between the absolute difference in the predictions and the total experimental
uncertainty is smaller than a given value, `cut_similarity_threshold`, which
must be provided. Note that for this to work with different cfactors across
the namespaces, one must provide a different `dataset_inputs` list for each.
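
Schematically, a point is kept when the absolute difference between the two
central predictions, divided by the experimental uncertainty on that point, is
below `cut_similarity_threshold`. A minimal sketch of that check (purely
illustrative; this is not the actual validphys implementation and the function
name is made up):

```python
import numpy as np

def similarity_mask(pred_a, pred_b, exp_err, threshold):
    """Keep points whose predictions differ by less than ``threshold``
    experimental standard deviations (illustrative sketch only)."""
    ratio = np.abs(np.asarray(pred_a) - np.asarray(pred_b)) / np.asarray(exp_err)
    return ratio < threshold
```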

The following example demonstrates these options:
The following example demonstrates the first three options:

```yaml
meta:
  # ... (rest of this example unchanged and collapsed in the diff view)
```

@@ -1671,6 +1679,70 @@

for each one individually. With these settings the latter three
[dataspecs](#general-data-specification-the-dataspec-api) give the
same result.

The following example demonstrates the use of `fromsimilarpredictions`:

```yaml
meta:
title: "Test similarity cuts: Threshold 1,2"
author: Zahari Kassabov
keywords: [test]

show_total: True

NNLODatasts: &NNLODatasts
- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: [QCD]} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: [QCD]} # N

NLODatasts: &NLODatasts
- {dataset: ATLAS_SINGLETOP_TCH_R_7TEV, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_R_13TEV, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM, frac: 1.0, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM, frac: 0.75, cfac: []} # N
- {dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM, frac: 0.75, cfac: []} # N


dataset_inputs: *NLODatasts

cuts_intersection_spec:
- theoryid: 52
pdf: NNPDF31_nlo_as_0118
dataset_inputs: *NLODatasts

- theoryid: 53
pdf: NNPDF31_nlo_as_0118
dataset_inputs: *NNLODatasts


theoryid: 52
pdf: NNPDF31_nlo_as_0118

dataspecs:

- use_cuts: internal
speclabel: "No cuts"


- cut_similarity_threshold: 2
speclabel: "Threshold 2"
use_cuts: fromsimilarpredictions


- cut_similarity_threshold: 1
speclabel: "Threshold 1"
use_cuts: fromsimilarpredictions

template_text: |
    {@dataspecs_chi2_table@}

actions_:
- report(main=True)
```
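
The same policy can also be exercised programmatically through the validphys
API, which is how the new test in this pull request checks it. The following is
a condensed sketch along those lines (the dataset lists are shortened for
illustration; the theory and PDF choices mirror the runcard above, and the
theories and PDF must be available locally):

```python
from validphys.api import API

nlo_datasets = [
    {"dataset": "ATLAS_SINGLETOP_TCH_R_7TEV", "frac": 1.0, "cfac": []},
    {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "frac": 1.0, "cfac": []},
]
nnlo_datasets = [
    {"dataset": "ATLAS_SINGLETOP_TCH_R_7TEV", "frac": 1.0, "cfac": ["QCD"]},
    {"dataset": "ATLAS_SINGLETOP_TCH_R_13TEV", "frac": 1.0, "cfac": ["QCD"]},
]

ds = API.dataset(
    dataset_input=nlo_datasets[0],
    theoryid=52,
    pdf="NNPDF31_nlo_as_0118",
    use_cuts="fromsimilarpredictions",
    cut_similarity_threshold=2,
    cuts_intersection_spec=[
        {"theoryid": 52, "pdf": "NNPDF31_nlo_as_0118", "dataset_inputs": nlo_datasets},
        {"theoryid": 53, "pdf": "NNPDF31_nlo_as_0118", "dataset_inputs": nnlo_datasets},
    ],
)
print(ds.cuts.load())  # indices of the points that pass the similarity cut
```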

### Data theory comparison

The name of the data-theory comparison tool is `plot_fancy`. You can
138 changes: 56 additions & 82 deletions validphys2/src/validphys/config.py
@@ -35,6 +35,7 @@
ExperimentInput,
CutsPolicy,
MatchedCuts,
SimilarCuts,
ThCovMatSpec,
)
from validphys.loader import (
@@ -362,7 +363,13 @@ def produce_commondata(self, *, dataset_input, use_fitcommondata=False, fit=None
except InconsistentMetaDataError as e:
raise ConfigError(e) from e

def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
def parse_cut_similarity_threshold(self, th: numbers.Real):
"""Maximum relative ratio when using `fromsimilarpredictons` cuts."""
return th

def produce_cuts(
self, *, commondata, use_cuts, rules, fit=None, theoryid=None,
):
"""Obtain cuts for a given dataset input, based on the
appropriate policy."""
# TODO: Put this bit of logic into loader.check_cuts
@@ -384,7 +391,10 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
if not theoryid:
raise ConfigError("theoryid must be specified for internal cuts")
return self.loader.check_internal_cuts(commondata, rules)
elif use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE:
elif (
use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE
or use_cuts is CutsPolicy.FROM_SIMILAR_PREDICTIONS_NAMESPACE
):
cut_list = []
_, nss = self.parse_from_(None, "cuts_intersection_spec", write=False)
self._check_dataspecs_type(nss)
@@ -401,7 +411,43 @@ def produce_cuts(self, *, commondata, use_cuts, rules, fit=None, theoryid=None):
_, nscuts = self.parse_from_(None, "cuts", write=False)
cut_list.append(nscuts)
ndata = commondata.ndata
return MatchedCuts(cut_list, ndata=ndata)
matched_cuts = MatchedCuts(cut_list, ndata=ndata)
if use_cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE:
return matched_cuts
else:
if len(nss) != 2:
raise ConfigError("Can only work with two namespaces")
_, cut_similarity_threshold = self.parse_from_(
None, "cut_similarity_threshold", write=False
)
name = commondata.name
inps = []
for i, ns in enumerate(nss):
with self.set_context(ns=self._curr_ns.new_child({**ns,})):
# TODO: find a way to not duplicate this and use a dict
# instead of a linear search
_, dins = self.parse_from_(None, "dataset_inputs", write=False)
try:
di = next(d for d in dins if d.name == name)
except StopIteration as e:
raise ConfigError(
f"cuts_intersection_spec namespace {i}: dataset inputs must define {name}"
) from e

with self.set_context(
ns=self._curr_ns.new_child(
{
"dataset_input": di,
"use_cuts": CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE,
**Contributor:** can we not just set `"cuts": matched_cuts`, or does that break something?

**Contributor:** Seems suboptimal to have to reproduce the exact same set of cuts, especially considering we saved them earlier, but I don't know about patching objects into the namespace directly.
"cuts": matched_cuts,
**ns,
}
)
):
_, ds = self.parse_from_(None, "dataset", write=False)
_, pdf = self.parse_from_(None, "pdf", write=False)
inps.append((ds, pdf))
return SimilarCuts(tuple(inps), cut_similarity_threshold)

raise TypeError("Wrong use_cuts")

@@ -455,17 +501,7 @@ def produce_dataset(
return ds

@configparser.element_of("experiments")
def parse_experiment(
self,
experiment: dict,
*,
theoryid,
use_cuts,
rules,
fit=None,
check_plotting: bool = False,
use_fitcommondata=False,
):
def parse_experiment(self, experiment: dict):
"""A set of datasets where correlated systematics are taken
into account. It is a mapping where the keys are the experiment
name 'experiment' and a list of datasets."""
@@ -475,41 +511,12 @@ def parse_experiment(
raise ConfigError(
"'experiment' must be a mapping with "
"'experiment' and 'datasets', but %s is missing" % e
)
) from e

dsinputs = [self.parse_dataset_input(ds) for ds in datasets]
cds = [
self.produce_commondata(
dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit
)
for dsinp in dsinputs
]
cutinps = [
self.produce_cuts(
rules=rules,
commondata=cd,
use_cuts=use_cuts,
fit=fit,
theoryid=theoryid,
)
for cd in cds
]

# autogenerated func, from elemet_of
datasets = [
self.produce_dataset(
rules=rules,
dataset_input=dsinp,
theoryid=theoryid,
cuts=cuts,
fit=fit,
check_plotting=check_plotting,
use_fitcommondata=use_fitcommondata,
)
for (dsinp, cuts) in zip(dsinputs, cutinps)
]

return DataGroupSpec(name=name, datasets=datasets, dsinputs=dsinputs)
return self.produce_data(group_name=name, data_input=dsinputs)

@configparser.element_of("experiment_inputs")
def parse_experiment_input(self, ei: dict):
@@ -1174,49 +1181,16 @@ def produce_data(
self,
data_input,
*,
theoryid,
use_cuts,
rules,
fit=None,
check_plotting: bool = False,
use_fitcommondata=False,
group_name="data",
):
"""A set of datasets where correlated systematics are taken
into account
"""
# TODO: extract the commondata and cuts and seperate from dataset
cds = [
self.produce_commondata(
dataset_input=dsinp, use_fitcommondata=use_fitcommondata, fit=fit
)
for dsinp in data_input
]
cutinps = [
self.produce_cuts(
rules=rules,
commondata=cd,
use_cuts=use_cuts,
fit=fit,
theoryid=theoryid,
)
for cd in cds
]
datasets = []
for dsinp in data_input:
with self.set_context(ns=self._curr_ns.new_child({"dataset_input": dsinp})):
datasets.append(self.parse_from_(None, "dataset", write=False)[1])

# autogenerated func, from element_of
datasets = [
self.produce_dataset(
rules=rules,
dataset_input=dsinp,
theoryid=theoryid,
cuts=cuts,
fit=fit,
check_plotting=check_plotting,
use_fitcommondata=use_fitcommondata,
)
for (dsinp, cuts) in zip(data_input, cutinps)
]
# TODO: get rid of libnnpdf Experiment
return DataGroupSpec(name=group_name, datasets=datasets, dsinputs=data_input)

def _parse_data_input_from_(
36 changes: 36 additions & 0 deletions validphys2/src/validphys/core.py
@@ -359,6 +359,8 @@ class CutsPolicy(enum.Enum):
NOCUTS = "nocuts"
FROMFIT = "fromfit"
FROM_CUT_INTERSECTION_NAMESPACE = "fromintersection"
FROM_SIMILAR_PREDICTIONS_NAMESPACE = "fromsimilarpredictions"


class Cuts(TupleComp):
def __init__(self, name, path):
@@ -396,6 +398,40 @@ def load(self):
self._full = True
return np.arange(self.ndata)

class SimilarCuts(TupleComp):
def __init__(self, inputs, threshold):
if len(inputs) != 2:
raise ValueError("Expecting two input tuples")
firstcuts, secondcuts = inputs[0][0].cuts, inputs[1][0].cuts
if firstcuts != secondcuts:
raise ValueError("Expecting cuts to be the same for all datasets")
self.inputs = inputs
self.threshold = threshold
super().__init__(self.inputs, self.threshold)
**Contributor:** @Zaharid just a general question about why we are inheriting from `TupleComp` here and in the other related classes, because I can't see anywhere where the methods from `TupleComp` are used?

**wilsonmr (Contributor), Jan 25, 2021:** The methods of `TupleComp` mean, amongst other things, that the subclasses are hashable. For example, you can instantiate two such classes with the same arguments and you will see that they are equal. See with `PDF`:

```python
>>> from validphys.core import PDF
>>> pdf1 = PDF("foo")
>>> pdf2 = PDF("foo")
>>> pdf1 == pdf2
True
```

This is useful because, as far as we are concerned, in this case a PDF is fully defined by its name (there should only be one NNPDF31_nnlo_as_0118, and that is all I need to know to get the correct PDF from the LHAPDF share folder).

Answering where this gets used: probably by the reportengine resource builder at some point, to find unique dependencies (but I don't know; Zahari can answer). However, all the classes which have their `load` method decorated with `@functools.lru_cache` must also be hashable, because the `lru_cache` stores the result of calling a function with a set of arguments, and to check whether it has already cached the result it needs to be able to see whether the arguments are actually the same. Since `load` always takes `self` as the first positional argument, you need to be able to check that `self` is indeed the same for two subsequent calls, which `TupleComp` facilitates.

There is also another double-underscore method which means that the classes print nicely:

```python
>>> pdf1
PDF(name='foo')
```

**Contributor:** Oh, and I should say that by default the first thing I said isn't the case:

```python
>>> class TestClass:
...  def __init__(self, a, b): self.a = a; self.b = b
...
>>> cls1 = TestClass(1, 2)
>>> cls2 = TestClass(1, 2)
>>> cls1 == cls2
False
```

Perhaps I misunderstood something, but that's my understanding anyway.

**Contributor (author):** `TupleComp` is mostly a simpleminded attempt to implement something like dataclasses from the standard library, before they existed (and with only the required properties). I would probably use those nowadays, despite some unwelcome complexity (and in fact this is already done in various newer interfaces, such as the fkparser). Note there is #408.

**Contributor:** Thank you both!
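
A minimal, self-contained illustration of the `lru_cache` point made in the
discussion above (purely illustrative; `Key` is a made-up class and not part of
validphys):

```python
import functools

class Key:
    """Hashable by value, in the same spirit as TupleComp subclasses."""

    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        return type(other) is type(self) and other.name == self.name

    def __hash__(self):
        return hash((type(self), self.name))

    @functools.lru_cache()
    def load(self):
        print(f"loading {self.name}")  # executed once per distinct key value
        return self.name.upper()

Key("foo").load()  # computes the result and prints "loading foo"
Key("foo").load()  # a different instance, but equal and hashable: cache hit, no print
```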


def load(self):
        # TODO: Update this when a suitable interface becomes available
from validphys.convolution import central_predictions
from validphys.commondataparser import load_commondata
from validphys.covmats import covmat_from_systematics

first, second = self.inputs
first_ds = first[0]
exp_err = np.sqrt(
np.diag(
covmat_from_systematics(
load_commondata(first_ds.commondata).with_cuts(first_ds.cuts)
)
)
)
# Compute matched predictions
delta = np.abs(
(central_predictions(*first) - central_predictions(*second)).squeeze(axis=1)
)
ratio = delta / exp_err
passed = ratio < self.threshold
return passed[passed].index


def cut_mask(cuts):
"""Return an objects that will act as the cuts when applied as a slice"""
6 changes: 3 additions & 3 deletions validphys2/src/validphys/dataplots.py
@@ -960,9 +960,9 @@ def plot_dataspecs_positivity(
@make_argcheck
def _check_display_cuts_requires_use_cuts(display_cuts, use_cuts):
check(
not (display_cuts
and use_cuts not in (CutsPolicy.FROMFIT, CutsPolicy.INTERNAL)),
"The display_cuts option requires setting use_cuts to True")
not (display_cuts and use_cuts is CutsPolicy.NOCUTS),
"The display_cuts option requires setting some cuts",
)

@make_argcheck
def _check_marker_by(marker_by):
18 changes: 18 additions & 0 deletions validphys2/src/validphys/tests/test_cuts.py
@@ -0,0 +1,18 @@
from validphys.api import API
from validphys.core import SimilarCuts
from validphys.tests.conftest import THEORYID, PDF, DATA



def test_similarity_cuts():
    plain = [{"dataset": dt["dataset"]} for dt in DATA]
    inp = {
        "theoryid": THEORYID,
        "pdf": PDF,
        "cut_similarity_threshold": 1.5,
        "use_cuts": "fromsimilarpredictions",
        "cuts_intersection_spec": [{"dataset_inputs": DATA}, {"dataset_inputs": plain}],
        "dataset_input": DATA[1],
    }
    ds = API.dataset(**inp)
    assert isinstance(ds.cuts, SimilarCuts)