From 85f779ffc91537c36a2ed92f073139112c7f5340 Mon Sep 17 00:00:00 2001
From: Trevor Elliott <telliott@fastly.com>
Date: Wed, 7 Feb 2024 13:56:26 -0800
Subject: [PATCH 1/3] Revert "Add missing subsume uses in egraph rules (#7879)"

This reverts commit 7464bbcb3b9b60005eaaa1738ad839c23fb48228.
---
 cranelift/codegen/src/opts/README.md    |  6 ------
 cranelift/codegen/src/opts/extends.isle |  4 ++--
 cranelift/codegen/src/opts/icmp.isle    | 20 ++++++++++----------
 3 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/cranelift/codegen/src/opts/README.md b/cranelift/codegen/src/opts/README.md
index d7b09e4add71..8685b91f5739 100644
--- a/cranelift/codegen/src/opts/README.md
+++ b/cranelift/codegen/src/opts/README.md
@@ -38,12 +38,6 @@ of it boils down to the fact that, unlike traditional e-graphs, our rules are
    place of `x` in such cases would introduce uses of `y` where it is not
    defined.
 
-   An exception to this rule is discarding constants, as they can be
-   rematerialized anywhere without introducing correctness issues. For example,
-   the (admittedly silly) rule `(select 1 x (iconst_u _)) => x` would be a good
-   candidate for not using `subsume`, as it does not discard any non-constant
-   values introduced in its LHS.
-
 3. Avoid overly general rewrites like commutativity and associativity. Instead,
    prefer targeted instances of the rewrite (for example, canonicalizing adds
    where one operand is a constant such that the constant is always the add's
diff --git a/cranelift/codegen/src/opts/extends.isle b/cranelift/codegen/src/opts/extends.isle
index 268407b0748a..d99f42cca71d 100644
--- a/cranelift/codegen/src/opts/extends.isle
+++ b/cranelift/codegen/src/opts/extends.isle
@@ -29,12 +29,12 @@
        (slt ty
              (uextend $I64 x @ (value_type $I32))
              (iconst_u _ 0)))
-      (subsume (iconst_u ty 0)))
+      (iconst_u ty 0))
 (rule (simplify
        (sge ty
              (uextend $I64 x @ (value_type $I32))
              (iconst_u _ 0)))
-      (subsume (iconst_u ty 1)))
+      (iconst_u ty 1))
 
 ;; Sign-extending can't change whether a number is zero nor how it signed-compares to zero
 (rule (simplify (eq _ (sextend _ x@(value_type ty)) (iconst_s _ 0)))
diff --git a/cranelift/codegen/src/opts/icmp.isle b/cranelift/codegen/src/opts/icmp.isle
index f7d20fa3d322..fc1d0a157b0b 100644
--- a/cranelift/codegen/src/opts/icmp.isle
+++ b/cranelift/codegen/src/opts/icmp.isle
@@ -2,16 +2,16 @@
 
 ;; `x == x` is always true for integers; `x != x` is false. Strict
 ;; inequalities are false, and loose inequalities are true.
-(rule (simplify (eq  (ty_int ty) x x)) (subsume (iconst_u ty 1)))
-(rule (simplify (ne  (ty_int ty) x x)) (subsume (iconst_u ty 0)))
-(rule (simplify (ugt (ty_int ty) x x)) (subsume (iconst_u ty 0)))
-(rule (simplify (uge (ty_int ty) x x)) (subsume (iconst_u ty 1)))
-(rule (simplify (sgt (ty_int ty) x x)) (subsume (iconst_u ty 0)))
-(rule (simplify (sge (ty_int ty) x x)) (subsume (iconst_u ty 1)))
-(rule (simplify (ult (ty_int ty) x x)) (subsume (iconst_u ty 0)))
-(rule (simplify (ule (ty_int ty) x x)) (subsume (iconst_u ty 1)))
-(rule (simplify (slt (ty_int ty) x x)) (subsume (iconst_u ty 0)))
-(rule (simplify (sle (ty_int ty) x x)) (subsume (iconst_u ty 1)))
+(rule (simplify (eq  (ty_int ty) x x)) (iconst_u ty 1))
+(rule (simplify (ne  (ty_int ty) x x)) (iconst_u ty 0))
+(rule (simplify (ugt (ty_int ty) x x)) (iconst_u ty 0))
+(rule (simplify (uge (ty_int ty) x x)) (iconst_u ty 1))
+(rule (simplify (sgt (ty_int ty) x x)) (iconst_u ty 0))
+(rule (simplify (sge (ty_int ty) x x)) (iconst_u ty 1))
+(rule (simplify (ult (ty_int ty) x x)) (iconst_u ty 0))
+(rule (simplify (ule (ty_int ty) x x)) (iconst_u ty 1))
+(rule (simplify (slt (ty_int ty) x x)) (iconst_u ty 0))
+(rule (simplify (sle (ty_int ty) x x)) (iconst_u ty 1))
 
 ;; Optimize icmp-of-icmp.
 (rule (simplify (ne ty

From ef004b13575935c67fb71d55009a5220aa0c9e71 Mon Sep 17 00:00:00 2001
From: Trevor Elliott <telliott@fastly.com>
Date: Wed, 7 Feb 2024 13:56:40 -0800
Subject: [PATCH 2/3] Revert "Cranelift: Use a fixpoint loop to compute the
 best value for each eclass (#7859)"

This reverts commit 5b2ae8365ed0ef68c6da1465c1f5d9d1f0b4624b.
---
 cranelift/codegen/src/egraph.rs               |   1 -
 cranelift/codegen/src/egraph/cost.rs          |  78 +++------
 cranelift/codegen/src/egraph/elaborate.rs     | 149 +++++-------------
 cranelift/codegen/src/opts/README.md          |  86 +---------
 cranelift/codegen/src/opts/cprop.isle         |  10 +-
 .../filetests/egraph/issue-7875.clif          |  37 -----
 6 files changed, 67 insertions(+), 294 deletions(-)
 delete mode 100644 cranelift/filetests/filetests/egraph/issue-7875.clif

diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs
index fbedace9de20..41d1512b22cf 100644
--- a/cranelift/codegen/src/egraph.rs
+++ b/cranelift/codegen/src/egraph.rs
@@ -701,5 +701,4 @@ pub(crate) struct Stats {
     pub(crate) elaborate_func: u64,
     pub(crate) elaborate_func_pre_insts: u64,
     pub(crate) elaborate_func_post_insts: u64,
-    pub(crate) elaborate_best_cost_fixpoint_iters: u64,
 }
diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 28aab40e9740..2870b61f515f 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -74,7 +74,7 @@ impl Cost {
     const DEPTH_BITS: u8 = 8;
     const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
     const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
-    const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
+    const MAX_OP_COST: u32 = (Self::OP_COST_MASK >> Self::DEPTH_BITS) - 1;
 
     pub(crate) fn infinity() -> Cost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
@@ -86,16 +86,14 @@ impl Cost {
         Cost(0)
     }
 
-    /// Construct a new `Cost` from the given parts.
+    /// Construct a new finite cost from the given parts.
     ///
-    /// If the opcode cost is greater than or equal to the maximum representable
-    /// opcode cost, then the resulting `Cost` saturates to infinity.
-    fn new(opcode_cost: u32, depth: u8) -> Cost {
-        if opcode_cost >= Self::MAX_OP_COST {
-            Self::infinity()
-        } else {
-            Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
-        }
+    /// The opcode cost is clamped to the maximum value representable.
+    fn new_finite(opcode_cost: u32, depth: u8) -> Cost {
+        let opcode_cost = std::cmp::min(opcode_cost, Self::MAX_OP_COST);
+        let cost = Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth));
+        debug_assert_ne!(cost, Cost::infinity());
+        cost
     }
 
     fn depth(&self) -> u8 {
@@ -113,7 +111,7 @@ impl Cost {
     /// that satisfies `inst_predicates::is_pure_for_egraph()`.
     pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
         let c = pure_op_cost(op) + operand_costs.into_iter().sum();
-        Cost::new(c.op_cost(), c.depth().saturating_add(1))
+        Cost::new_finite(c.op_cost(), c.depth().saturating_add(1))
     }
 }
 
@@ -133,9 +131,12 @@ impl std::ops::Add<Cost> for Cost {
     type Output = Cost;
 
     fn add(self, other: Cost) -> Cost {
-        let op_cost = self.op_cost().saturating_add(other.op_cost());
+        let op_cost = std::cmp::min(
+            self.op_cost().saturating_add(other.op_cost()),
+            Self::MAX_OP_COST,
+        );
         let depth = std::cmp::max(self.depth(), other.depth());
-        Cost::new(op_cost, depth)
+        Cost::new_finite(op_cost, depth)
     }
 }
 
@@ -146,11 +147,11 @@ impl std::ops::Add<Cost> for Cost {
 fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
-        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
+        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new_finite(1, 0),
 
         // Extends/reduces.
         Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
-            Cost::new(2, 0)
+            Cost::new_finite(2, 0)
         }
 
         // "Simple" arithmetic.
@@ -162,52 +163,9 @@ fn pure_op_cost(op: Opcode) -> Cost {
         | Opcode::Bnot
         | Opcode::Ishl
         | Opcode::Ushr
-        | Opcode::Sshr => Cost::new(3, 0),
+        | Opcode::Sshr => Cost::new_finite(3, 0),
 
         // Everything else (pure.)
-        _ => Cost::new(4, 0),
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn add_cost() {
-        let a = Cost::new(5, 2);
-        let b = Cost::new(37, 3);
-        assert_eq!(a + b, Cost::new(42, 3));
-        assert_eq!(b + a, Cost::new(42, 3));
-    }
-
-    #[test]
-    fn add_infinity() {
-        let a = Cost::new(5, 2);
-        let b = Cost::infinity();
-        assert_eq!(a + b, Cost::infinity());
-        assert_eq!(b + a, Cost::infinity());
-    }
-
-    #[test]
-    fn op_cost_saturates_to_infinity() {
-        let a = Cost::new(Cost::MAX_OP_COST - 10, 2);
-        let b = Cost::new(11, 2);
-        assert_eq!(a + b, Cost::infinity());
-        assert_eq!(b + a, Cost::infinity());
-    }
-
-    #[test]
-    fn depth_saturates_to_max_depth() {
-        let a = Cost::new(10, u8::MAX);
-        let b = Cost::new(10, 1);
-        assert_eq!(
-            Cost::of_pure_op(Opcode::Iconst, [a, b]),
-            Cost::new(21, u8::MAX)
-        );
-        assert_eq!(
-            Cost::of_pure_op(Opcode::Iconst, [b, a]),
-            Cost::new(21, u8::MAX)
-        );
+        _ => Cost::new_finite(4, 0),
     }
 }
diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs
index 9b9d1ea05641..5637215ee66d 100644
--- a/cranelift/codegen/src/egraph/elaborate.rs
+++ b/cranelift/codegen/src/egraph/elaborate.rs
@@ -7,7 +7,6 @@ use super::Stats;
 use crate::dominator_tree::DominatorTree;
 use crate::fx::{FxHashMap, FxHashSet};
 use crate::hash_map::Entry as HashEntry;
-use crate::inst_predicates::is_pure_for_egraph;
 use crate::ir::{Block, Function, Inst, Value, ValueDef};
 use crate::loop_analysis::{Loop, LoopAnalysis};
 use crate::scoped_hash_map::ScopedHashMap;
@@ -217,112 +216,46 @@ impl<'a> Elaborator<'a> {
 
     fn compute_best_values(&mut self) {
         let best = &mut self.value_to_best_value;
-
-        // Do a fixpoint loop to compute the best value for each eclass.
-        //
-        // The maximum number of iterations is the length of the longest chain
-        // of `vNN -> vMM` edges in the dataflow graph where `NN < MM`, so this
-        // is *technically* quadratic, but `cranelift-frontend` won't construct
-        // any such edges. NaN canonicalization will introduce some of these
-        // edges, but they are chains of only two or three edges. So in
-        // practice, we *never* do more than a handful of iterations here unless
-        // (a) we parsed the CLIF from text and the text was funkily numbered,
-        // which we don't really care about, or (b) the CLIF producer did
-        // something weird, in which case it is their responsibility to stop
-        // doing that.
-        trace!("Entering fixpoint loop to compute the best values for each eclass");
-        let mut keep_going = true;
-        while keep_going {
-            keep_going = false;
-            trace!(
-                "fixpoint iteration {}",
-                self.stats.elaborate_best_cost_fixpoint_iters
-            );
-            self.stats.elaborate_best_cost_fixpoint_iters += 1;
-
-            for (value, def) in self.func.dfg.values_and_defs() {
-                trace!("computing best for value {:?} def {:?}", value, def);
-                let orig_best_value = best[value];
-
-                match def {
-                    ValueDef::Union(x, y) => {
-                        // Pick the best of the two options based on
-                        // min-cost. This works because each element of `best`
-                        // is a `(cost, value)` tuple; `cost` comes first so
-                        // the natural comparison works based on cost, and
-                        // breaks ties based on value number.
-                        best[value] = std::cmp::min(best[x], best[y]);
-                        trace!(
-                            " -> best of union({:?}, {:?}) = {:?}",
-                            best[x],
-                            best[y],
-                            best[value]
-                        );
-                    }
-                    ValueDef::Param(_, _) => {
+        for (value, def) in self.func.dfg.values_and_defs() {
+            trace!("computing best for value {:?} def {:?}", value, def);
+            match def {
+                ValueDef::Union(x, y) => {
+                    // Pick the best of the two options based on
+                    // min-cost. This works because each element of `best`
+                    // is a `(cost, value)` tuple; `cost` comes first so
+                    // the natural comparison works based on cost, and
+                    // breaks ties based on value number.
+                    trace!(" -> best of {:?} and {:?}", best[x], best[y]);
+                    best[value] = std::cmp::min(best[x], best[y]);
+                    trace!(" -> {:?}", best[value]);
+                }
+                ValueDef::Param(_, _) => {
+                    best[value] = BestEntry(Cost::zero(), value);
+                }
+                // If the Inst is inserted into the layout (which is,
+                // at this point, only the side-effecting skeleton),
+                // then it must be computed and thus we give it zero
+                // cost.
+                ValueDef::Result(inst, _) => {
+                    if let Some(_) = self.func.layout.inst_block(inst) {
                         best[value] = BestEntry(Cost::zero(), value);
+                    } else {
+                        trace!(" -> value {}: result, computing cost", value);
+                        let inst_data = &self.func.dfg.insts[inst];
+                        // N.B.: at this point we know that the opcode is
+                        // pure, so `pure_op_cost`'s precondition is
+                        // satisfied.
+                        let cost = Cost::of_pure_op(
+                            inst_data.opcode(),
+                            self.func.dfg.inst_values(inst).map(|value| best[value].0),
+                        );
+                        best[value] = BestEntry(cost, value);
                     }
-                    // If the Inst is inserted into the layout (which is,
-                    // at this point, only the side-effecting skeleton),
-                    // then it must be computed and thus we give it zero
-                    // cost.
-                    ValueDef::Result(inst, _) => {
-                        if let Some(_) = self.func.layout.inst_block(inst) {
-                            best[value] = BestEntry(Cost::zero(), value);
-                        } else {
-                            let inst_data = &self.func.dfg.insts[inst];
-                            // N.B.: at this point we know that the opcode is
-                            // pure, so `pure_op_cost`'s precondition is
-                            // satisfied.
-                            let cost = Cost::of_pure_op(
-                                inst_data.opcode(),
-                                self.func.dfg.inst_values(inst).map(|value| best[value].0),
-                            );
-                            best[value] = BestEntry(cost, value);
-                            trace!(" -> cost of value {} = {:?}", value, cost);
-                        }
-                    }
-                };
-
-                // Keep on iterating the fixpoint loop while we are finding new
-                // best values.
-                keep_going |= orig_best_value != best[value];
-            }
-        }
-
-        if cfg!(any(feature = "trace-log", debug_assertions)) {
-            trace!("finished fixpoint loop to compute best value for each eclass");
-            for value in self.func.dfg.values() {
-                trace!("-> best for eclass {:?}: {:?}", value, best[value]);
-                debug_assert_ne!(best[value].1, Value::reserved_value());
-                // You might additionally be expecting an assert that the best
-                // cost is not infinity, however infinite cost *can* happen in
-                // practice. First, note that our cost function doesn't know
-                // about any shared structure in the dataflow graph, it only
-                // sums operand costs. (And trying to avoid that by deduping a
-                // single operation's operands is a losing game because you can
-                // always just add one indirection and go from `add(x, x)` to
-                // `add(foo(x), bar(x))` to hide the shared structure.) Given
-                // that blindness to sharing, we can make cost grow
-                // exponentially with a linear sequence of operations:
-                //
-                //     v0 = iconst.i32 1    ;; cost = 1
-                //     v1 = iadd v0, v0     ;; cost = 3 + 1 + 1
-                //     v2 = iadd v1, v1     ;; cost = 3 + 5 + 5
-                //     v3 = iadd v2, v2     ;; cost = 3 + 13 + 13
-                //     v4 = iadd v3, v3     ;; cost = 3 + 29 + 29
-                //     v5 = iadd v4, v4     ;; cost = 3 + 61 + 61
-                //     v6 = iadd v5, v5     ;; cost = 3 + 125 + 125
-                //     ;; etc...
-                //
-                // Such a chain can cause cost to saturate to infinity. How do
-                // we choose which e-node is best when there are multiple that
-                // have saturated to infinity? It doesn't matter. As long as
-                // invariant (2) for optimization rules is upheld by our rule
-                // set (see `cranelift/codegen/src/opts/README.md`) it is safe
-                // to choose *any* e-node in the e-class. At worst we will
-                // produce suboptimal code, but never an incorrectness.
-            }
+                }
+            };
+            debug_assert_ne!(best[value].0, Cost::infinity());
+            debug_assert_ne!(best[value].1, Value::reserved_value());
+            trace!("best for eclass {:?}: {:?}", value, best[value]);
         }
     }
 
@@ -673,13 +606,7 @@ impl<'a> Elaborator<'a> {
                         }
                         inst
                     };
-
                     // Place the inst just before `before`.
-                    debug_assert!(
-                        is_pure_for_egraph(self.func, inst),
-                        "something has gone very wrong if we are elaborating effectful \
-                         instructions, they should have remained in the skeleton"
-                    );
                     self.func.layout.insert_inst(inst, before);
 
                     // Update the inst's arguments.
diff --git a/cranelift/codegen/src/opts/README.md b/cranelift/codegen/src/opts/README.md
index 8685b91f5739..b4f46420abe0 100644
--- a/cranelift/codegen/src/opts/README.md
+++ b/cranelift/codegen/src/opts/README.md
@@ -1,81 +1,5 @@
-# Rules for Writing Optimization Rules
-
-For both correctness and compile speed, we must be careful with our rules. A lot
-of it boils down to the fact that, unlike traditional e-graphs, our rules are
-*directional*.
-
-1. Rules should not rewrite to worse code: the right-hand side should be at
-   least as good as the left-hand side or better.
-
-   For example, the rule
-
-       x => (add x 0)
-
-   is disallowed, but swapping its left- and right-hand sides produces a rule
-   that is allowed.
-
-   Any kind of canonicalizing rule that intends to help subsequent rules match
-   and unlock further optimizations (e.g. floating constants to the right side
-   for our constant-propagation rules to match) must produce canonicalized
-   output that is no worse than its noncanonical input.
-
-   We assume this invariant as a heuristic to break ties between two
-   otherwise-equal-cost expressions in various places, making up for some
-   limitations of our explicit cost function.
-
-2. Any rule that removes value-uses in its right-hand side that previously
-   existed in its left-hand side MUST use `subsume`.
-
-   For example, the rule
-
-       (select 1 x y) => x
-
-   MUST use `subsume`.
-
-   This is required for correctness because, once a value-use is removed, some
-   e-nodes in the e-class are more equal than others. There might be uses of `x`
-   in a scope where `y` is not available, and so emitting `(select 1 x y)` in
-   place of `x` in such cases would introduce uses of `y` where it is not
-   defined.
-
-3. Avoid overly general rewrites like commutativity and associativity. Instead,
-   prefer targeted instances of the rewrite (for example, canonicalizing adds
-   where one operand is a constant such that the constant is always the add's
-   second operand, rather than general commutativity for adds) or even writing
-   the "same" optimization rule multiple times.
-
-   For example, the commutativity in the first rule in the following snippet is
-   bad because it will match even when the first operand is not an add:
-
-       ;; Commute to allow `(foo (add ...) x)`, when we see it, to match.
-       (foo x y) => (foo y x)
-
-       ;; Optimize.
-       (foo x (add ...)) => (bar x)
-
-   Better is to commute only when we know that canonicalizing in this way will
-   all definitely allow the subsequent optimization rule to match:
-
-       ;; Canonicalize all adds to `foo`'s second operand.
-       (foo (add ...) x) => (foo x (add ...))
-
-       ;; Optimize.
-       (foo x (add ...)) => (bar x)
-
-   But even better in this case is to write the "same" optimization multiple
-   times:
-
-       (foo (add ...) x) => (bar x)
-       (foo x (add ...)) => (bar x)
-
-   The cost of rule-matching is amortized by the ISLE compiler, where as the
-   intermediate result of each rewrite allocates new e-nodes and requires
-   storage in the dataflow graph. Therefore, additional rules are cheaper than
-   additional e-nodes.
-
-   Commutativity and associativity in particular can cause huge amounts of
-   e-graph bloat.
-
-   One day we intend to extend ISLE with built-in support for commutativity, so
-   we don't need to author the redundant commutations ourselves:
-   https://github.com/bytecodealliance/wasmtime/issues/6128
+Rules here are allowed to rewrite pure expressions arbitrarily,
+using the same inputs as the original, or fewer. In other words, we
+cannot pull a new eclass id out of thin air and refer to it, other
+than a piece of the input or a new node that we construct; but we
+can freely rewrite e.g. `x+y-y` to `x`.
diff --git a/cranelift/codegen/src/opts/cprop.isle b/cranelift/codegen/src/opts/cprop.isle
index f84b300f62bd..94574592d2e2 100644
--- a/cranelift/codegen/src/opts/cprop.isle
+++ b/cranelift/codegen/src/opts/cprop.isle
@@ -167,10 +167,12 @@
        (bxor ty (bxor ty x k1 @ (iconst ty _)) k2 @ (iconst ty _)))
       (bxor ty x (bxor ty k1 k2)))
 
-(rule (simplify (select ty (iconst_u _ (u64_nonzero _)) x _))
-      (subsume x))
-(rule (simplify (select ty (iconst_u _ 0) _ y))
-      (subsume y))
+(rule (simplify
+       (select ty (iconst_u _ (u64_nonzero _)) x y))
+      x)
+(rule (simplify
+       (select ty (iconst_u _ 0) x y))
+      y)
 
 ;; Replace subtraction by a "negative" constant with addition.
 ;; Notably, this gives `x - (-1) == x + 1`, so other patterns don't have to
diff --git a/cranelift/filetests/filetests/egraph/issue-7875.clif b/cranelift/filetests/filetests/egraph/issue-7875.clif
deleted file mode 100644
index abdcb32288a5..000000000000
--- a/cranelift/filetests/filetests/egraph/issue-7875.clif
+++ /dev/null
@@ -1,37 +0,0 @@
-test optimize
-set enable_verifier=true
-set opt_level=speed
-target x86_64
-
-;; This test case should optimize just fine, and should definitely not produce
-;; CLIF that has verifier errors like
-;;
-;;     error: inst10 (v12 = select.f32 v11, v4, v10 ; v11 = 1): uses value arg
-;;     from non-dominating block4
-
-function %foo() {
-block0:
-    v0 = iconst.i64 0
-    v2 = f32const 0.0
-    v9 = f32const 0.0
-    v20 = fneg v2
-    v18 = fcmp eq v20, v20
-    v4 = select v18, v2, v20
-    v8 = iconst.i32 0
-    v11 = iconst.i32 1
-    brif v0, block2, block3
-
-block2:
-    brif.i32 v8, block4(v2), block4(v9)
-
-block4(v10: f32):
-    v12 = select.f32 v11, v4, v10
-    v13 = bitcast.i32 v12
-    store v13, v0
-    trap user0
-
-block3:
-    v15 = bitcast.i32 v4
-    store v15, v0
-    return
-}

From 6903ec029c72fc5c8ad95222e12ebf8ed28c1179 Mon Sep 17 00:00:00 2001
From: Trevor Elliott <telliott@fastly.com>
Date: Wed, 7 Feb 2024 14:39:29 -0800
Subject: [PATCH 3/3] Disallow values from jumping between blocks
 inappropriately

Undo changes to the union find structure and gvn_map when popping
the node stack in `remove_pure_and_optimize`, ensuring that we don't
propagate values in a way that would invalidate the invariant that the
`ValueData::Union` structure provides.

Additionally, remove all uses of the union find structure from
elaboration, as we have already computed the best values at that point.

Co-authored-by: Jamey Sharp <jsharp@fastly.com>
Co-authored-by: L Pereira <l.pereira@fastly.com>
Co-authored-by: Nick Fitzgerald <fitzgen@gmail.com>
---
 cranelift/codegen/src/ctxhash.rs          |  2 ++
 cranelift/codegen/src/egraph.rs           | 11 +++++--
 cranelift/codegen/src/egraph/elaborate.rs | 36 ++++++++++++-----------
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/cranelift/codegen/src/ctxhash.rs b/cranelift/codegen/src/ctxhash.rs
index e172d46c127a..190dd64a34d1 100644
--- a/cranelift/codegen/src/ctxhash.rs
+++ b/cranelift/codegen/src/ctxhash.rs
@@ -51,6 +51,7 @@ impl<V: Eq + Hash> CtxHash<V> for NullCtx {
 /// the hashcode, for memory efficiency: in common use, `K` and `V`
 /// are often 32 bits also, and a 12-byte bucket is measurably better
 /// than a 16-byte bucket.
+#[derive(Clone)]
 struct BucketData<K, V> {
     hash: u32,
     k: K,
@@ -58,6 +59,7 @@ struct BucketData<K, V> {
 }
 
 /// A HashMap that takes external context for all operations.
+#[derive(Clone)]
 pub struct CtxHashMap<K, V> {
     raw: RawTable<BucketData<K, V>>,
 }
diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs
index 41d1512b22cf..81da4559c93f 100644
--- a/cranelift/codegen/src/egraph.rs
+++ b/cranelift/codegen/src/egraph.rs
@@ -497,7 +497,7 @@ impl<'a> EgraphPass<'a> {
         let root = self.domtree_children.root();
         enum StackEntry {
             Visit(Block),
-            Pop,
+            Pop(UnionFind<Value>, CtxHashMap<(Type, InstructionData), Value>),
         }
         let mut block_stack = vec![StackEntry::Visit(root)];
         while let Some(entry) = block_stack.pop() {
@@ -505,7 +505,7 @@ impl<'a> EgraphPass<'a> {
                 StackEntry::Visit(block) => {
                     // We popped this block; push children
                     // immediately, then process this block.
-                    block_stack.push(StackEntry::Pop);
+                    block_stack.push(StackEntry::Pop(self.eclasses.clone(), gvn_map.clone()));
                     block_stack
                         .extend(self.domtree_children.children(block).map(StackEntry::Visit));
                     effectful_gvn_map.increment_depth();
@@ -579,8 +579,13 @@ impl<'a> EgraphPass<'a> {
                         }
                     }
                 }
-                StackEntry::Pop => {
+                StackEntry::Pop(eclasses, saved_gvn_map) => {
                     effectful_gvn_map.decrement_depth();
+                    // TODO: do we need to save both eclasses and gvn_map? All the tests pass
+                    // without restoring eclasses, but perhaps that's a limitation of our test
+                    // suite.
+                    self.eclasses = eclasses;
+                    gvn_map = saved_gvn_map;
                 }
             }
         }
diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs
index 5637215ee66d..e01343770fe0 100644
--- a/cranelift/codegen/src/egraph/elaborate.rs
+++ b/cranelift/codegen/src/egraph/elaborate.rs
@@ -21,7 +21,6 @@ pub(crate) struct Elaborator<'a> {
     domtree: &'a DominatorTree,
     domtree_children: &'a DomTreeWithChildren,
     loop_analysis: &'a LoopAnalysis,
-    eclasses: &'a mut UnionFind<Value>,
     /// Map from Value that is produced by a pure Inst (and was thus
     /// not in the side-effecting skeleton) to the value produced by
     /// an elaborated inst (placed in the layout) to whose results we
@@ -137,7 +136,7 @@ impl<'a> Elaborator<'a> {
         domtree_children: &'a DomTreeWithChildren,
         loop_analysis: &'a LoopAnalysis,
         remat_values: &'a FxHashSet<Value>,
-        eclasses: &'a mut UnionFind<Value>,
+        _eclasses: &'a mut UnionFind<Value>,
         stats: &'a mut Stats,
     ) -> Self {
         let num_values = func.dfg.num_values();
@@ -149,7 +148,6 @@ impl<'a> Elaborator<'a> {
             domtree,
             domtree_children,
             loop_analysis,
-            eclasses,
             value_to_elaborated_value: ScopedHashMap::with_capacity(num_values),
             value_to_best_value,
             loop_stack: smallvec![],
@@ -325,14 +323,14 @@ impl<'a> Elaborator<'a> {
                     let value = self.func.dfg.resolve_aliases(value);
 
                     self.stats.elaborate_visit_node += 1;
-                    let canonical_value = self.eclasses.find_and_update(value);
-                    debug_assert_ne!(canonical_value, Value::reserved_value());
-                    trace!(
-                        "elaborate: value {} canonical {} before {}",
-                        value,
-                        canonical_value,
-                        before
-                    );
+                    // let canonical_value = self.eclasses.find_and_update(value);
+                    // debug_assert_ne!(canonical_value, Value::reserved_value());
+                    // trace!(
+                    //     "elaborate: value {} canonical {} before {}",
+                    //     value,
+                    //     canonical_value,
+                    //     before
+                    // );
 
                     // Get the best option; we use `value` (latest
                     // value) here so we have a full view of the
@@ -342,7 +340,8 @@ impl<'a> Elaborator<'a> {
                     trace!("elaborate: value {} -> best {}", value, best_value);
                     debug_assert_ne!(best_value, Value::reserved_value());
 
-                    if let Some(elab_val) = self.value_to_elaborated_value.get(&canonical_value) {
+                    // if let Some(elab_val) = self.value_to_elaborated_value.get(&canonical_value) {
+                    if let Some(elab_val) = self.value_to_elaborated_value.get(&best_value) {
                         // Value is available; use it.
                         trace!("elaborate: value {} -> {:?}", value, elab_val);
                         self.stats.elaborate_memoize_hit += 1;
@@ -568,15 +567,16 @@ impl<'a> Elaborator<'a> {
                                 value: new_result,
                                 in_block: insert_block,
                             };
-                            let canonical_result = self.eclasses.find_and_update(result);
+                            // let canonical_result = self.eclasses.find_and_update(result);
+                            let BestEntry(_, canonical_result) = self.value_to_best_value[result];
                             self.value_to_elaborated_value.insert_if_absent_with_depth(
                                 canonical_result,
                                 elab_value,
                                 scope_depth,
                             );
 
-                            self.eclasses.add(new_result);
-                            self.eclasses.union(result, new_result);
+                            // self.eclasses.add(new_result);
+                            // self.eclasses.union(result, new_result);
                             self.value_to_best_value[new_result] = self.value_to_best_value[result];
 
                             trace!(
@@ -596,7 +596,8 @@ impl<'a> Elaborator<'a> {
                                 value: result,
                                 in_block: insert_block,
                             };
-                            let canonical_result = self.eclasses.find_and_update(result);
+                            // let canonical_result = self.eclasses.find_and_update(result);
+                            let BestEntry(_, canonical_result) = self.value_to_best_value[result];
                             self.value_to_elaborated_value.insert_if_absent_with_depth(
                                 canonical_result,
                                 elab_value,
@@ -684,7 +685,8 @@ impl<'a> Elaborator<'a> {
             // map now.
             for &result in self.func.dfg.inst_results(inst) {
                 trace!(" -> result {}", result);
-                let canonical_result = self.eclasses.find_and_update(result);
+                // let canonical_result = self.eclasses.find_and_update(result);
+                let BestEntry(_, canonical_result) = self.value_to_best_value[result];
                 self.value_to_elaborated_value.insert_if_absent(
                     canonical_result,
                     ElaboratedValue {