From 85f779ffc91537c36a2ed92f073139112c7f5340 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Wed, 7 Feb 2024 13:56:26 -0800 Subject: [PATCH 1/3] Revert "Add missing subsume uses in egraph rules (#7879)" This reverts commit 7464bbcb3b9b60005eaaa1738ad839c23fb48228. --- cranelift/codegen/src/opts/README.md | 6 ------ cranelift/codegen/src/opts/extends.isle | 4 ++-- cranelift/codegen/src/opts/icmp.isle | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/cranelift/codegen/src/opts/README.md b/cranelift/codegen/src/opts/README.md index d7b09e4add71..8685b91f5739 100644 --- a/cranelift/codegen/src/opts/README.md +++ b/cranelift/codegen/src/opts/README.md @@ -38,12 +38,6 @@ of it boils down to the fact that, unlike traditional e-graphs, our rules are place of `x` in such cases would introduce uses of `y` where it is not defined. - An exception to this rule is discarding constants, as they can be - rematerialized anywhere without introducing correctness issues. For example, - the (admittedly silly) rule `(select 1 x (iconst_u _)) => x` would be a good - candidate for not using `subsume`, as it does not discard any non-constant - values introduced in its LHS. - 3. Avoid overly general rewrites like commutativity and associativity. Instead, prefer targeted instances of the rewrite (for example, canonicalizing adds where one operand is a constant such that the constant is always the add's diff --git a/cranelift/codegen/src/opts/extends.isle b/cranelift/codegen/src/opts/extends.isle index 268407b0748a..d99f42cca71d 100644 --- a/cranelift/codegen/src/opts/extends.isle +++ b/cranelift/codegen/src/opts/extends.isle @@ -29,12 +29,12 @@ (slt ty (uextend $I64 x @ (value_type $I32)) (iconst_u _ 0))) - (subsume (iconst_u ty 0))) + (iconst_u ty 0)) (rule (simplify (sge ty (uextend $I64 x @ (value_type $I32)) (iconst_u _ 0))) - (subsume (iconst_u ty 1))) + (iconst_u ty 1)) ;; Sign-extending can't change whether a number is zero nor how it signed-compares to zero (rule (simplify (eq _ (sextend _ x@(value_type ty)) (iconst_s _ 0))) diff --git a/cranelift/codegen/src/opts/icmp.isle b/cranelift/codegen/src/opts/icmp.isle index f7d20fa3d322..fc1d0a157b0b 100644 --- a/cranelift/codegen/src/opts/icmp.isle +++ b/cranelift/codegen/src/opts/icmp.isle @@ -2,16 +2,16 @@ ;; `x == x` is always true for integers; `x != x` is false. Strict ;; inequalities are false, and loose inequalities are true. -(rule (simplify (eq (ty_int ty) x x)) (subsume (iconst_u ty 1))) -(rule (simplify (ne (ty_int ty) x x)) (subsume (iconst_u ty 0))) -(rule (simplify (ugt (ty_int ty) x x)) (subsume (iconst_u ty 0))) -(rule (simplify (uge (ty_int ty) x x)) (subsume (iconst_u ty 1))) -(rule (simplify (sgt (ty_int ty) x x)) (subsume (iconst_u ty 0))) -(rule (simplify (sge (ty_int ty) x x)) (subsume (iconst_u ty 1))) -(rule (simplify (ult (ty_int ty) x x)) (subsume (iconst_u ty 0))) -(rule (simplify (ule (ty_int ty) x x)) (subsume (iconst_u ty 1))) -(rule (simplify (slt (ty_int ty) x x)) (subsume (iconst_u ty 0))) -(rule (simplify (sle (ty_int ty) x x)) (subsume (iconst_u ty 1))) +(rule (simplify (eq (ty_int ty) x x)) (iconst_u ty 1)) +(rule (simplify (ne (ty_int ty) x x)) (iconst_u ty 0)) +(rule (simplify (ugt (ty_int ty) x x)) (iconst_u ty 0)) +(rule (simplify (uge (ty_int ty) x x)) (iconst_u ty 1)) +(rule (simplify (sgt (ty_int ty) x x)) (iconst_u ty 0)) +(rule (simplify (sge (ty_int ty) x x)) (iconst_u ty 1)) +(rule (simplify (ult (ty_int ty) x x)) (iconst_u ty 0)) +(rule (simplify (ule (ty_int ty) x x)) (iconst_u ty 1)) +(rule (simplify (slt (ty_int ty) x x)) (iconst_u ty 0)) +(rule (simplify (sle (ty_int ty) x x)) (iconst_u ty 1)) ;; Optimize icmp-of-icmp. (rule (simplify (ne ty From ef004b13575935c67fb71d55009a5220aa0c9e71 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Wed, 7 Feb 2024 13:56:40 -0800 Subject: [PATCH 2/3] Revert "Cranelift: Use a fixpoint loop to compute the best value for each eclass (#7859)" This reverts commit 5b2ae8365ed0ef68c6da1465c1f5d9d1f0b4624b. --- cranelift/codegen/src/egraph.rs | 1 - cranelift/codegen/src/egraph/cost.rs | 78 +++------ cranelift/codegen/src/egraph/elaborate.rs | 149 +++++------------- cranelift/codegen/src/opts/README.md | 86 +--------- cranelift/codegen/src/opts/cprop.isle | 10 +- .../filetests/egraph/issue-7875.clif | 37 ----- 6 files changed, 67 insertions(+), 294 deletions(-) delete mode 100644 cranelift/filetests/filetests/egraph/issue-7875.clif diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs index fbedace9de20..41d1512b22cf 100644 --- a/cranelift/codegen/src/egraph.rs +++ b/cranelift/codegen/src/egraph.rs @@ -701,5 +701,4 @@ pub(crate) struct Stats { pub(crate) elaborate_func: u64, pub(crate) elaborate_func_pre_insts: u64, pub(crate) elaborate_func_post_insts: u64, - pub(crate) elaborate_best_cost_fixpoint_iters: u64, } diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs index 28aab40e9740..2870b61f515f 100644 --- a/cranelift/codegen/src/egraph/cost.rs +++ b/cranelift/codegen/src/egraph/cost.rs @@ -74,7 +74,7 @@ impl Cost { const DEPTH_BITS: u8 = 8; const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1; const OP_COST_MASK: u32 = !Self::DEPTH_MASK; - const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS; + const MAX_OP_COST: u32 = (Self::OP_COST_MASK >> Self::DEPTH_BITS) - 1; pub(crate) fn infinity() -> Cost { // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost` @@ -86,16 +86,14 @@ impl Cost { Cost(0) } - /// Construct a new `Cost` from the given parts. + /// Construct a new finite cost from the given parts. /// - /// If the opcode cost is greater than or equal to the maximum representable - /// opcode cost, then the resulting `Cost` saturates to infinity. - fn new(opcode_cost: u32, depth: u8) -> Cost { - if opcode_cost >= Self::MAX_OP_COST { - Self::infinity() - } else { - Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth)) - } + /// The opcode cost is clamped to the maximum value representable. + fn new_finite(opcode_cost: u32, depth: u8) -> Cost { + let opcode_cost = std::cmp::min(opcode_cost, Self::MAX_OP_COST); + let cost = Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth)); + debug_assert_ne!(cost, Cost::infinity()); + cost } fn depth(&self) -> u8 { @@ -113,7 +111,7 @@ impl Cost { /// that satisfies `inst_predicates::is_pure_for_egraph()`. pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator) -> Self { let c = pure_op_cost(op) + operand_costs.into_iter().sum(); - Cost::new(c.op_cost(), c.depth().saturating_add(1)) + Cost::new_finite(c.op_cost(), c.depth().saturating_add(1)) } } @@ -133,9 +131,12 @@ impl std::ops::Add for Cost { type Output = Cost; fn add(self, other: Cost) -> Cost { - let op_cost = self.op_cost().saturating_add(other.op_cost()); + let op_cost = std::cmp::min( + self.op_cost().saturating_add(other.op_cost()), + Self::MAX_OP_COST, + ); let depth = std::cmp::max(self.depth(), other.depth()); - Cost::new(op_cost, depth) + Cost::new_finite(op_cost, depth) } } @@ -146,11 +147,11 @@ impl std::ops::Add for Cost { fn pure_op_cost(op: Opcode) -> Cost { match op { // Constants. - Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0), + Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new_finite(1, 0), // Extends/reduces. Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => { - Cost::new(2, 0) + Cost::new_finite(2, 0) } // "Simple" arithmetic. @@ -162,52 +163,9 @@ fn pure_op_cost(op: Opcode) -> Cost { | Opcode::Bnot | Opcode::Ishl | Opcode::Ushr - | Opcode::Sshr => Cost::new(3, 0), + | Opcode::Sshr => Cost::new_finite(3, 0), // Everything else (pure.) - _ => Cost::new(4, 0), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn add_cost() { - let a = Cost::new(5, 2); - let b = Cost::new(37, 3); - assert_eq!(a + b, Cost::new(42, 3)); - assert_eq!(b + a, Cost::new(42, 3)); - } - - #[test] - fn add_infinity() { - let a = Cost::new(5, 2); - let b = Cost::infinity(); - assert_eq!(a + b, Cost::infinity()); - assert_eq!(b + a, Cost::infinity()); - } - - #[test] - fn op_cost_saturates_to_infinity() { - let a = Cost::new(Cost::MAX_OP_COST - 10, 2); - let b = Cost::new(11, 2); - assert_eq!(a + b, Cost::infinity()); - assert_eq!(b + a, Cost::infinity()); - } - - #[test] - fn depth_saturates_to_max_depth() { - let a = Cost::new(10, u8::MAX); - let b = Cost::new(10, 1); - assert_eq!( - Cost::of_pure_op(Opcode::Iconst, [a, b]), - Cost::new(21, u8::MAX) - ); - assert_eq!( - Cost::of_pure_op(Opcode::Iconst, [b, a]), - Cost::new(21, u8::MAX) - ); + _ => Cost::new_finite(4, 0), } } diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs index 9b9d1ea05641..5637215ee66d 100644 --- a/cranelift/codegen/src/egraph/elaborate.rs +++ b/cranelift/codegen/src/egraph/elaborate.rs @@ -7,7 +7,6 @@ use super::Stats; use crate::dominator_tree::DominatorTree; use crate::fx::{FxHashMap, FxHashSet}; use crate::hash_map::Entry as HashEntry; -use crate::inst_predicates::is_pure_for_egraph; use crate::ir::{Block, Function, Inst, Value, ValueDef}; use crate::loop_analysis::{Loop, LoopAnalysis}; use crate::scoped_hash_map::ScopedHashMap; @@ -217,112 +216,46 @@ impl<'a> Elaborator<'a> { fn compute_best_values(&mut self) { let best = &mut self.value_to_best_value; - - // Do a fixpoint loop to compute the best value for each eclass. - // - // The maximum number of iterations is the length of the longest chain - // of `vNN -> vMM` edges in the dataflow graph where `NN < MM`, so this - // is *technically* quadratic, but `cranelift-frontend` won't construct - // any such edges. NaN canonicalization will introduce some of these - // edges, but they are chains of only two or three edges. So in - // practice, we *never* do more than a handful of iterations here unless - // (a) we parsed the CLIF from text and the text was funkily numbered, - // which we don't really care about, or (b) the CLIF producer did - // something weird, in which case it is their responsibility to stop - // doing that. - trace!("Entering fixpoint loop to compute the best values for each eclass"); - let mut keep_going = true; - while keep_going { - keep_going = false; - trace!( - "fixpoint iteration {}", - self.stats.elaborate_best_cost_fixpoint_iters - ); - self.stats.elaborate_best_cost_fixpoint_iters += 1; - - for (value, def) in self.func.dfg.values_and_defs() { - trace!("computing best for value {:?} def {:?}", value, def); - let orig_best_value = best[value]; - - match def { - ValueDef::Union(x, y) => { - // Pick the best of the two options based on - // min-cost. This works because each element of `best` - // is a `(cost, value)` tuple; `cost` comes first so - // the natural comparison works based on cost, and - // breaks ties based on value number. - best[value] = std::cmp::min(best[x], best[y]); - trace!( - " -> best of union({:?}, {:?}) = {:?}", - best[x], - best[y], - best[value] - ); - } - ValueDef::Param(_, _) => { + for (value, def) in self.func.dfg.values_and_defs() { + trace!("computing best for value {:?} def {:?}", value, def); + match def { + ValueDef::Union(x, y) => { + // Pick the best of the two options based on + // min-cost. This works because each element of `best` + // is a `(cost, value)` tuple; `cost` comes first so + // the natural comparison works based on cost, and + // breaks ties based on value number. + trace!(" -> best of {:?} and {:?}", best[x], best[y]); + best[value] = std::cmp::min(best[x], best[y]); + trace!(" -> {:?}", best[value]); + } + ValueDef::Param(_, _) => { + best[value] = BestEntry(Cost::zero(), value); + } + // If the Inst is inserted into the layout (which is, + // at this point, only the side-effecting skeleton), + // then it must be computed and thus we give it zero + // cost. + ValueDef::Result(inst, _) => { + if let Some(_) = self.func.layout.inst_block(inst) { best[value] = BestEntry(Cost::zero(), value); + } else { + trace!(" -> value {}: result, computing cost", value); + let inst_data = &self.func.dfg.insts[inst]; + // N.B.: at this point we know that the opcode is + // pure, so `pure_op_cost`'s precondition is + // satisfied. + let cost = Cost::of_pure_op( + inst_data.opcode(), + self.func.dfg.inst_values(inst).map(|value| best[value].0), + ); + best[value] = BestEntry(cost, value); } - // If the Inst is inserted into the layout (which is, - // at this point, only the side-effecting skeleton), - // then it must be computed and thus we give it zero - // cost. - ValueDef::Result(inst, _) => { - if let Some(_) = self.func.layout.inst_block(inst) { - best[value] = BestEntry(Cost::zero(), value); - } else { - let inst_data = &self.func.dfg.insts[inst]; - // N.B.: at this point we know that the opcode is - // pure, so `pure_op_cost`'s precondition is - // satisfied. - let cost = Cost::of_pure_op( - inst_data.opcode(), - self.func.dfg.inst_values(inst).map(|value| best[value].0), - ); - best[value] = BestEntry(cost, value); - trace!(" -> cost of value {} = {:?}", value, cost); - } - } - }; - - // Keep on iterating the fixpoint loop while we are finding new - // best values. - keep_going |= orig_best_value != best[value]; - } - } - - if cfg!(any(feature = "trace-log", debug_assertions)) { - trace!("finished fixpoint loop to compute best value for each eclass"); - for value in self.func.dfg.values() { - trace!("-> best for eclass {:?}: {:?}", value, best[value]); - debug_assert_ne!(best[value].1, Value::reserved_value()); - // You might additionally be expecting an assert that the best - // cost is not infinity, however infinite cost *can* happen in - // practice. First, note that our cost function doesn't know - // about any shared structure in the dataflow graph, it only - // sums operand costs. (And trying to avoid that by deduping a - // single operation's operands is a losing game because you can - // always just add one indirection and go from `add(x, x)` to - // `add(foo(x), bar(x))` to hide the shared structure.) Given - // that blindness to sharing, we can make cost grow - // exponentially with a linear sequence of operations: - // - // v0 = iconst.i32 1 ;; cost = 1 - // v1 = iadd v0, v0 ;; cost = 3 + 1 + 1 - // v2 = iadd v1, v1 ;; cost = 3 + 5 + 5 - // v3 = iadd v2, v2 ;; cost = 3 + 13 + 13 - // v4 = iadd v3, v3 ;; cost = 3 + 29 + 29 - // v5 = iadd v4, v4 ;; cost = 3 + 61 + 61 - // v6 = iadd v5, v5 ;; cost = 3 + 125 + 125 - // ;; etc... - // - // Such a chain can cause cost to saturate to infinity. How do - // we choose which e-node is best when there are multiple that - // have saturated to infinity? It doesn't matter. As long as - // invariant (2) for optimization rules is upheld by our rule - // set (see `cranelift/codegen/src/opts/README.md`) it is safe - // to choose *any* e-node in the e-class. At worst we will - // produce suboptimal code, but never an incorrectness. - } + } + }; + debug_assert_ne!(best[value].0, Cost::infinity()); + debug_assert_ne!(best[value].1, Value::reserved_value()); + trace!("best for eclass {:?}: {:?}", value, best[value]); } } @@ -673,13 +606,7 @@ impl<'a> Elaborator<'a> { } inst }; - // Place the inst just before `before`. - debug_assert!( - is_pure_for_egraph(self.func, inst), - "something has gone very wrong if we are elaborating effectful \ - instructions, they should have remained in the skeleton" - ); self.func.layout.insert_inst(inst, before); // Update the inst's arguments. diff --git a/cranelift/codegen/src/opts/README.md b/cranelift/codegen/src/opts/README.md index 8685b91f5739..b4f46420abe0 100644 --- a/cranelift/codegen/src/opts/README.md +++ b/cranelift/codegen/src/opts/README.md @@ -1,81 +1,5 @@ -# Rules for Writing Optimization Rules - -For both correctness and compile speed, we must be careful with our rules. A lot -of it boils down to the fact that, unlike traditional e-graphs, our rules are -*directional*. - -1. Rules should not rewrite to worse code: the right-hand side should be at - least as good as the left-hand side or better. - - For example, the rule - - x => (add x 0) - - is disallowed, but swapping its left- and right-hand sides produces a rule - that is allowed. - - Any kind of canonicalizing rule that intends to help subsequent rules match - and unlock further optimizations (e.g. floating constants to the right side - for our constant-propagation rules to match) must produce canonicalized - output that is no worse than its noncanonical input. - - We assume this invariant as a heuristic to break ties between two - otherwise-equal-cost expressions in various places, making up for some - limitations of our explicit cost function. - -2. Any rule that removes value-uses in its right-hand side that previously - existed in its left-hand side MUST use `subsume`. - - For example, the rule - - (select 1 x y) => x - - MUST use `subsume`. - - This is required for correctness because, once a value-use is removed, some - e-nodes in the e-class are more equal than others. There might be uses of `x` - in a scope where `y` is not available, and so emitting `(select 1 x y)` in - place of `x` in such cases would introduce uses of `y` where it is not - defined. - -3. Avoid overly general rewrites like commutativity and associativity. Instead, - prefer targeted instances of the rewrite (for example, canonicalizing adds - where one operand is a constant such that the constant is always the add's - second operand, rather than general commutativity for adds) or even writing - the "same" optimization rule multiple times. - - For example, the commutativity in the first rule in the following snippet is - bad because it will match even when the first operand is not an add: - - ;; Commute to allow `(foo (add ...) x)`, when we see it, to match. - (foo x y) => (foo y x) - - ;; Optimize. - (foo x (add ...)) => (bar x) - - Better is to commute only when we know that canonicalizing in this way will - all definitely allow the subsequent optimization rule to match: - - ;; Canonicalize all adds to `foo`'s second operand. - (foo (add ...) x) => (foo x (add ...)) - - ;; Optimize. - (foo x (add ...)) => (bar x) - - But even better in this case is to write the "same" optimization multiple - times: - - (foo (add ...) x) => (bar x) - (foo x (add ...)) => (bar x) - - The cost of rule-matching is amortized by the ISLE compiler, where as the - intermediate result of each rewrite allocates new e-nodes and requires - storage in the dataflow graph. Therefore, additional rules are cheaper than - additional e-nodes. - - Commutativity and associativity in particular can cause huge amounts of - e-graph bloat. - - One day we intend to extend ISLE with built-in support for commutativity, so - we don't need to author the redundant commutations ourselves: - https://github.com/bytecodealliance/wasmtime/issues/6128 +Rules here are allowed to rewrite pure expressions arbitrarily, +using the same inputs as the original, or fewer. In other words, we +cannot pull a new eclass id out of thin air and refer to it, other +than a piece of the input or a new node that we construct; but we +can freely rewrite e.g. `x+y-y` to `x`. diff --git a/cranelift/codegen/src/opts/cprop.isle b/cranelift/codegen/src/opts/cprop.isle index f84b300f62bd..94574592d2e2 100644 --- a/cranelift/codegen/src/opts/cprop.isle +++ b/cranelift/codegen/src/opts/cprop.isle @@ -167,10 +167,12 @@ (bxor ty (bxor ty x k1 @ (iconst ty _)) k2 @ (iconst ty _))) (bxor ty x (bxor ty k1 k2))) -(rule (simplify (select ty (iconst_u _ (u64_nonzero _)) x _)) - (subsume x)) -(rule (simplify (select ty (iconst_u _ 0) _ y)) - (subsume y)) +(rule (simplify + (select ty (iconst_u _ (u64_nonzero _)) x y)) + x) +(rule (simplify + (select ty (iconst_u _ 0) x y)) + y) ;; Replace subtraction by a "negative" constant with addition. ;; Notably, this gives `x - (-1) == x + 1`, so other patterns don't have to diff --git a/cranelift/filetests/filetests/egraph/issue-7875.clif b/cranelift/filetests/filetests/egraph/issue-7875.clif deleted file mode 100644 index abdcb32288a5..000000000000 --- a/cranelift/filetests/filetests/egraph/issue-7875.clif +++ /dev/null @@ -1,37 +0,0 @@ -test optimize -set enable_verifier=true -set opt_level=speed -target x86_64 - -;; This test case should optimize just fine, and should definitely not produce -;; CLIF that has verifier errors like -;; -;; error: inst10 (v12 = select.f32 v11, v4, v10 ; v11 = 1): uses value arg -;; from non-dominating block4 - -function %foo() { -block0: - v0 = iconst.i64 0 - v2 = f32const 0.0 - v9 = f32const 0.0 - v20 = fneg v2 - v18 = fcmp eq v20, v20 - v4 = select v18, v2, v20 - v8 = iconst.i32 0 - v11 = iconst.i32 1 - brif v0, block2, block3 - -block2: - brif.i32 v8, block4(v2), block4(v9) - -block4(v10: f32): - v12 = select.f32 v11, v4, v10 - v13 = bitcast.i32 v12 - store v13, v0 - trap user0 - -block3: - v15 = bitcast.i32 v4 - store v15, v0 - return -} From 6903ec029c72fc5c8ad95222e12ebf8ed28c1179 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Wed, 7 Feb 2024 14:39:29 -0800 Subject: [PATCH 3/3] Disallow values from jumping between blocks inappropriately Undo changes to the union find structure and gvn_map when popping the node stack in `remove_pure_and_optimize`, ensuring that we don't propagate values in a way that would invalidate the invariant that the `ValueData::Union` structure provides. Additionally, remove all uses of the union find structure from elaboration, as we have already computed the best values at that point. Co-authored-by: Jamey Sharp Co-authored-by: L Pereira Co-authored-by: Nick Fitzgerald --- cranelift/codegen/src/ctxhash.rs | 2 ++ cranelift/codegen/src/egraph.rs | 11 +++++-- cranelift/codegen/src/egraph/elaborate.rs | 36 ++++++++++++----------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/cranelift/codegen/src/ctxhash.rs b/cranelift/codegen/src/ctxhash.rs index e172d46c127a..190dd64a34d1 100644 --- a/cranelift/codegen/src/ctxhash.rs +++ b/cranelift/codegen/src/ctxhash.rs @@ -51,6 +51,7 @@ impl CtxHash for NullCtx { /// the hashcode, for memory efficiency: in common use, `K` and `V` /// are often 32 bits also, and a 12-byte bucket is measurably better /// than a 16-byte bucket. +#[derive(Clone)] struct BucketData { hash: u32, k: K, @@ -58,6 +59,7 @@ struct BucketData { } /// A HashMap that takes external context for all operations. +#[derive(Clone)] pub struct CtxHashMap { raw: RawTable>, } diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs index 41d1512b22cf..81da4559c93f 100644 --- a/cranelift/codegen/src/egraph.rs +++ b/cranelift/codegen/src/egraph.rs @@ -497,7 +497,7 @@ impl<'a> EgraphPass<'a> { let root = self.domtree_children.root(); enum StackEntry { Visit(Block), - Pop, + Pop(UnionFind, CtxHashMap<(Type, InstructionData), Value>), } let mut block_stack = vec![StackEntry::Visit(root)]; while let Some(entry) = block_stack.pop() { @@ -505,7 +505,7 @@ impl<'a> EgraphPass<'a> { StackEntry::Visit(block) => { // We popped this block; push children // immediately, then process this block. - block_stack.push(StackEntry::Pop); + block_stack.push(StackEntry::Pop(self.eclasses.clone(), gvn_map.clone())); block_stack .extend(self.domtree_children.children(block).map(StackEntry::Visit)); effectful_gvn_map.increment_depth(); @@ -579,8 +579,13 @@ impl<'a> EgraphPass<'a> { } } } - StackEntry::Pop => { + StackEntry::Pop(eclasses, saved_gvn_map) => { effectful_gvn_map.decrement_depth(); + // TODO: do we need to save both eclasses and gvn_map? All the tests pass + // without restoring eclasses, but perhaps that's a limitation of our test + // suite. + self.eclasses = eclasses; + gvn_map = saved_gvn_map; } } } diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs index 5637215ee66d..e01343770fe0 100644 --- a/cranelift/codegen/src/egraph/elaborate.rs +++ b/cranelift/codegen/src/egraph/elaborate.rs @@ -21,7 +21,6 @@ pub(crate) struct Elaborator<'a> { domtree: &'a DominatorTree, domtree_children: &'a DomTreeWithChildren, loop_analysis: &'a LoopAnalysis, - eclasses: &'a mut UnionFind, /// Map from Value that is produced by a pure Inst (and was thus /// not in the side-effecting skeleton) to the value produced by /// an elaborated inst (placed in the layout) to whose results we @@ -137,7 +136,7 @@ impl<'a> Elaborator<'a> { domtree_children: &'a DomTreeWithChildren, loop_analysis: &'a LoopAnalysis, remat_values: &'a FxHashSet, - eclasses: &'a mut UnionFind, + _eclasses: &'a mut UnionFind, stats: &'a mut Stats, ) -> Self { let num_values = func.dfg.num_values(); @@ -149,7 +148,6 @@ impl<'a> Elaborator<'a> { domtree, domtree_children, loop_analysis, - eclasses, value_to_elaborated_value: ScopedHashMap::with_capacity(num_values), value_to_best_value, loop_stack: smallvec![], @@ -325,14 +323,14 @@ impl<'a> Elaborator<'a> { let value = self.func.dfg.resolve_aliases(value); self.stats.elaborate_visit_node += 1; - let canonical_value = self.eclasses.find_and_update(value); - debug_assert_ne!(canonical_value, Value::reserved_value()); - trace!( - "elaborate: value {} canonical {} before {}", - value, - canonical_value, - before - ); + // let canonical_value = self.eclasses.find_and_update(value); + // debug_assert_ne!(canonical_value, Value::reserved_value()); + // trace!( + // "elaborate: value {} canonical {} before {}", + // value, + // canonical_value, + // before + // ); // Get the best option; we use `value` (latest // value) here so we have a full view of the @@ -342,7 +340,8 @@ impl<'a> Elaborator<'a> { trace!("elaborate: value {} -> best {}", value, best_value); debug_assert_ne!(best_value, Value::reserved_value()); - if let Some(elab_val) = self.value_to_elaborated_value.get(&canonical_value) { + // if let Some(elab_val) = self.value_to_elaborated_value.get(&canonical_value) { + if let Some(elab_val) = self.value_to_elaborated_value.get(&best_value) { // Value is available; use it. trace!("elaborate: value {} -> {:?}", value, elab_val); self.stats.elaborate_memoize_hit += 1; @@ -568,15 +567,16 @@ impl<'a> Elaborator<'a> { value: new_result, in_block: insert_block, }; - let canonical_result = self.eclasses.find_and_update(result); + // let canonical_result = self.eclasses.find_and_update(result); + let BestEntry(_, canonical_result) = self.value_to_best_value[result]; self.value_to_elaborated_value.insert_if_absent_with_depth( canonical_result, elab_value, scope_depth, ); - self.eclasses.add(new_result); - self.eclasses.union(result, new_result); + // self.eclasses.add(new_result); + // self.eclasses.union(result, new_result); self.value_to_best_value[new_result] = self.value_to_best_value[result]; trace!( @@ -596,7 +596,8 @@ impl<'a> Elaborator<'a> { value: result, in_block: insert_block, }; - let canonical_result = self.eclasses.find_and_update(result); + // let canonical_result = self.eclasses.find_and_update(result); + let BestEntry(_, canonical_result) = self.value_to_best_value[result]; self.value_to_elaborated_value.insert_if_absent_with_depth( canonical_result, elab_value, @@ -684,7 +685,8 @@ impl<'a> Elaborator<'a> { // map now. for &result in self.func.dfg.inst_results(inst) { trace!(" -> result {}", result); - let canonical_result = self.eclasses.find_and_update(result); + // let canonical_result = self.eclasses.find_and_update(result); + let BestEntry(_, canonical_result) = self.value_to_best_value[result]; self.value_to_elaborated_value.insert_if_absent( canonical_result, ElaboratedValue {