diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 40ae31c8021f..c064e7f2fac2 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -172,9 +172,9 @@ class GenerateProducerBody : public NoOpCollapsingMutator { } else { // This semaphore will end up on both sides of the fork, // so we'd better duplicate it. - string cloned_acquire = var->name + unique_name('_'); - cloned_acquires[var->name] = cloned_acquire; - return Acquire::make(Variable::make(type_of(), cloned_acquire), op->count, body); + vector &clones = cloned_acquires[var->name]; + clones.push_back(var->name + unique_name('_')); + return Acquire::make(Variable::make(type_of(), clones.back()), op->count, body); } } @@ -192,11 +192,11 @@ class GenerateProducerBody : public NoOpCollapsingMutator { return op; } - map &cloned_acquires; + map> &cloned_acquires; set inner_semaphores; public: - GenerateProducerBody(const string &f, const vector &s, map &a) + GenerateProducerBody(const string &f, const vector &s, map> &a) : func(f), sema(s), cloned_acquires(a) { } }; @@ -311,7 +311,7 @@ class ForkAsyncProducers : public IRMutator { const map &env; - map cloned_acquires; + map> cloned_acquires; Stmt visit(const Realize *op) override { auto it = env.find(op->name); @@ -354,10 +354,10 @@ class ForkAsyncProducers : public IRMutator { // If there's a nested async producer, we may have // recursively cloned this semaphore inside the mutation // of the producer and consumer. - auto it = cloned_acquires.find(sema_name); - if (it != cloned_acquires.end()) { - body = CloneAcquire(sema_name, it->second).mutate(body); - body = LetStmt::make(it->second, sema_space, body); + const vector &clones = cloned_acquires[sema_name]; + for (const auto &i : clones) { + body = CloneAcquire(sema_name, i).mutate(body); + body = LetStmt::make(i, sema_space, body); } body = LetStmt::make(sema_name, sema_space, body); diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 0a5381972000..c1e0d1fb7bfb 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -980,12 +980,27 @@ class CollapseSelects : public IRMutator { } }; -class ContainsLoop : public IRVisitor { +class ContainsHotLoop : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { result = true; } + void visit(const IfThenElse *op) override { + op->then_case.accept(this); + + // Don't count loops that appear in cold paths + const Call *c = op->condition.as(); + bool else_case_is_cold = + (c && + (c->is_intrinsic(Call::likely_if_innermost) || + c->is_intrinsic(Call::likely))); + if (op->else_case.defined() && + !else_case_is_cold) { + op->else_case.accept(this); + } + } + public: bool result = false; }; @@ -1009,7 +1024,7 @@ class LowerLikelyIfInnermost : public IRMutator { } Stmt visit(const For *op) override { - ContainsLoop c; + ContainsHotLoop c; op->body.accept(&c); inside_innermost_loop = !c.result; Stmt stmt = IRMutator::visit(op); diff --git a/src/Solve.cpp b/src/Solve.cpp index 553e3a91fa30..b5e381436e4b 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -417,9 +417,10 @@ class SolveExpression : public IRMutator { } Expr visit(const Call *op) override { - // Ignore likely intrinsics + // Ignore intrinsics that shouldn't affect the results. if (op->is_intrinsic(Call::likely) || - op->is_intrinsic(Call::likely_if_innermost)) { + op->is_intrinsic(Call::likely_if_innermost) || + op->is_intrinsic(Call::promise_clamped)) { return mutate(op->args[0]); } else { return IRMutator::visit(op); diff --git a/src/TrimNoOps.cpp b/src/TrimNoOps.cpp index 059a6236f35b..459ea62fbb90 100644 --- a/src/TrimNoOps.cpp +++ b/src/TrimNoOps.cpp @@ -163,6 +163,10 @@ class IsNoOp : public IRVisitor { IRVisitor::visit(op); } + void visit(const Acquire *op) override { + condition = const_false(); + } + template void visit_let(const LetOrLetStmt *op) { IRVisitor::visit(op); @@ -371,6 +375,8 @@ class TrimNoOps : public IRMutator { if (is_const_one(is_no_op.condition)) { // This loop is definitely useless + debug(3) << "Removed empty loop.\n" + << "Old: " << Stmt(op) << "\n"; return Evaluate::make(0); } else if (is_const_zero(is_no_op.condition)) { // This loop is definitely needed @@ -391,6 +397,8 @@ class TrimNoOps : public IRMutator { if (i.is_empty()) { // Empty loop + debug(3) << "Removed empty loop.\n" + << "Old: " << Stmt(op) << "\n"; return Evaluate::make(0); } diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 10483a823bcc..34ebe72603ef 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -243,7 +243,7 @@ void uniquify_variable_names_test() { {{x, Let::make(y.name(), 3, y)}, {x_1, Let::make(y.name(), 4, y)}}); - std::cout << "is_monotonic test passed" << std::endl; + std::cout << "uniquify_variable_names test passed" << std::endl; } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index ac6ca2d495ec..e31126b4949e 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -30,86 +30,87 @@ Expr get_lane(const Expr &e, int l) { /** Find the exact max and min lanes of a vector expression. Not * conservative like bounds_of_expr, but uses similar rules for some - * common node types where it can be exact. */ -Interval bounds_of_lanes(const Expr &e) { + * common node types where it can be exact. If e is a nested vector, + * the result will be the bounds of the vectors in each lane. */ +Interval bounds_of_nested_lanes(const Expr &e) { if (const Add *add = e.as()) { if (const Broadcast *b = add->b.as()) { - Interval ia = bounds_of_lanes(add->a); + Interval ia = bounds_of_nested_lanes(add->a); return {ia.min + b->value, ia.max + b->value}; } else if (const Broadcast *b = add->a.as()) { - Interval ia = bounds_of_lanes(add->b); + Interval ia = bounds_of_nested_lanes(add->b); return {b->value + ia.min, b->value + ia.max}; } } else if (const Sub *sub = e.as()) { if (const Broadcast *b = sub->b.as()) { - Interval ia = bounds_of_lanes(sub->a); + Interval ia = bounds_of_nested_lanes(sub->a); return {ia.min - b->value, ia.max - b->value}; } else if (const Broadcast *b = sub->a.as()) { - Interval ia = bounds_of_lanes(sub->b); + Interval ia = bounds_of_nested_lanes(sub->b); return {b->value - ia.max, b->value - ia.max}; } } else if (const Mul *mul = e.as()) { if (const Broadcast *b = mul->b.as()) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_lanes(mul->a); + Interval ia = bounds_of_nested_lanes(mul->a); return {ia.min * b->value, ia.max * b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_lanes(mul->a); + Interval ia = bounds_of_nested_lanes(mul->a); return {ia.max * b->value, ia.min * b->value}; } } else if (const Broadcast *b = mul->a.as()) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_lanes(mul->b); + Interval ia = bounds_of_nested_lanes(mul->b); return {b->value * ia.min, b->value * ia.max}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_lanes(mul->b); + Interval ia = bounds_of_nested_lanes(mul->b); return {b->value * ia.max, b->value * ia.min}; } } } else if (const Div *div = e.as
()) { if (const Broadcast *b = div->b.as()) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_lanes(div->a); + Interval ia = bounds_of_nested_lanes(div->a); return {ia.min / b->value, ia.max / b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_lanes(div->a); + Interval ia = bounds_of_nested_lanes(div->a); return {ia.max / b->value, ia.min / b->value}; } } } else if (const And *and_ = e.as()) { if (const Broadcast *b = and_->b.as()) { - Interval ia = bounds_of_lanes(and_->a); + Interval ia = bounds_of_nested_lanes(and_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = and_->a.as()) { - Interval ia = bounds_of_lanes(and_->b); + Interval ia = bounds_of_nested_lanes(and_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Or *or_ = e.as()) { if (const Broadcast *b = or_->b.as()) { - Interval ia = bounds_of_lanes(or_->a); + Interval ia = bounds_of_nested_lanes(or_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = or_->a.as()) { - Interval ia = bounds_of_lanes(or_->b); + Interval ia = bounds_of_nested_lanes(or_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Min *min = e.as()) { if (const Broadcast *b = min->b.as()) { - Interval ia = bounds_of_lanes(min->a); + Interval ia = bounds_of_nested_lanes(min->a); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } else if (const Broadcast *b = min->a.as()) { - Interval ia = bounds_of_lanes(min->b); + Interval ia = bounds_of_nested_lanes(min->b); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } } else if (const Max *max = e.as()) { if (const Broadcast *b = max->b.as()) { - Interval ia = bounds_of_lanes(max->a); + Interval ia = bounds_of_nested_lanes(max->a); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } else if (const Broadcast *b = max->a.as()) { - Interval ia = bounds_of_lanes(max->b); + Interval ia = bounds_of_nested_lanes(max->b); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } } else if (const Not *not_ = e.as()) { - Interval ia = bounds_of_lanes(not_->a); + Interval ia = bounds_of_nested_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); @@ -118,11 +119,30 @@ Interval bounds_of_lanes(const Expr &e) { } else if (is_negative_const(r->stride)) { return {r->base + last_lane_idx * r->stride, r->base}; } + } else if (const LE *le = e.as()) { + // The least true this can be is if we maximize the LHS and minimize the RHS. + // The most true this can be is if we minimize the LHS and maximize the RHS. + // This is only exact if one of the two sides is a Broadcast. + Interval ia = bounds_of_nested_lanes(le->a); + Interval ib = bounds_of_nested_lanes(le->b); + if (ia.is_single_point() || ib.is_single_point()) { + return {ia.max <= ib.min, ia.min <= ib.max}; + } + } else if (const LT *lt = e.as()) { + // The least true this can be is if we maximize the LHS and minimize the RHS. + // The most true this can be is if we minimize the LHS and maximize the RHS. + // This is only exact if one of the two sides is a Broadcast. + Interval ia = bounds_of_nested_lanes(lt->a); + Interval ib = bounds_of_nested_lanes(lt->b); + if (ia.is_single_point() || ib.is_single_point()) { + return {ia.max < ib.min, ia.min < ib.max}; + } + } else if (const Broadcast *b = e.as()) { return {b->value, b->value}; } else if (const Let *let = e.as()) { - Interval ia = bounds_of_lanes(let->value); - Interval ib = bounds_of_lanes(let->body); + Interval ia = bounds_of_nested_lanes(let->value); + Interval ib = bounds_of_nested_lanes(let->body); if (expr_uses_var(ib.min, let->name)) { ib.min = Let::make(let->name, let->value, ib.min); } @@ -145,6 +165,19 @@ Interval bounds_of_lanes(const Expr &e) { } }; +/** Similar to bounds_of_nested_lanes, but it recursively reduces + * the bounds of nested vectors to scalars. */ +Interval bounds_of_lanes(const Expr &e) { + Interval bounds = bounds_of_nested_lanes(e); + if (!bounds.min.type().is_scalar()) { + bounds.min = bounds_of_lanes(bounds.min).min; + } + if (!bounds.max.type().is_scalar()) { + bounds.max = bounds_of_lanes(bounds.max).max; + } + return bounds; +} + // A ramp with the lanes repeated inner_repetitions times, and then // the whole vector repeated outer_repetitions times. // E.g: <0 0 2 2 4 4 6 6 0 0 2 2 4 4 6 6>.