diff --git a/apps/blur/halide_blur_generator.cpp b/apps/blur/halide_blur_generator.cpp index 5c208f796fee..d5dfe51f8f15 100644 --- a/apps/blur/halide_blur_generator.cpp +++ b/apps/blur/halide_blur_generator.cpp @@ -82,6 +82,7 @@ class HalideBlur : public Halide::Generator { } } else if (get_target().has_feature(Target::HVX)) { // Hexagon schedule. + // TODO: Try using a schedule like the CPU one below. const int vector_size = 128; blur_y.compute_root() @@ -96,8 +97,17 @@ class HalideBlur : public Halide::Generator { .vectorize(x, vector_size); } else { // CPU schedule. - blur_y.split(y, y, yi, 8).parallel(y).vectorize(x, 8); - blur_x.store_at(blur_y, y).compute_at(blur_y, yi).vectorize(x, 8); + // Compute blur_x as needed at each vector of the output. + // Halide will store blur_x in a circular buffer so its + // results can be re-used. + blur_y + .split(y, y, yi, 32) + .parallel(y) + .vectorize(x, 16); + blur_x + .store_at(blur_y, y) + .compute_at(blur_y, x) + .vectorize(x, 16); } } }; diff --git a/apps/blur/test.cpp b/apps/blur/test.cpp index 88f8d058a4cb..6d7e678285e7 100644 --- a/apps/blur/test.cpp +++ b/apps/blur/test.cpp @@ -159,8 +159,8 @@ int main(int argc, char **argv) { const bool is_hexagon = strstr(md->target, "hvx_128") || strstr(md->target, "hvx_64"); // The Hexagon simulator can't allocate as much memory as the above wants. - const int width = is_hexagon ? 648 : 6408; - const int height = is_hexagon ? 482 : 4802; + const int width = is_hexagon ? 648 : 2568; + const int height = is_hexagon ? 
482 : 1922; Buffer input(width, height); diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index 9b68c7cdb109..9c8005724555 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -530,7 +530,7 @@ void CameraPipe::generate() { .compute_at(processed, yi) .store_at(processed, yo) .prefetch(input, y, 2) - .fold_storage(y, 16) + .fold_storage(y, 4) .tile(x, y, x, y, xi, yi, 2 * vec, 2) .vectorize(xi) .unroll(yi); @@ -538,7 +538,7 @@ void CameraPipe::generate() { deinterleaved .compute_at(processed, yi) .store_at(processed, yo) - .fold_storage(y, 8) + .fold_storage(y, 4) .reorder(c, x, y) .vectorize(x, 2 * vec, TailStrategy::RoundUp) .unroll(c); diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index 77d32b8ac4a8..4a27e3dd454a 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -148,7 +148,7 @@ class LocalLaplacian : public Halide::Generator { outGPyramid[j] .store_at(output, yo) .compute_at(output, y) - .fold_storage(y, 8) + .fold_storage(y, 4) .vectorize(x, 8); } outGPyramid[0].compute_at(output, y).vectorize(x, 8); diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index 6b1798b25528..7fa67ac2192f 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -349,8 +349,12 @@ class ExtractSharedAndHeapAllocations : public IRMutator { // repeated dependence on the block var s.size = solve_expression(s.size, op->name).result; s.size = simplify(common_subexpression_elimination(s.size)); - auto result = is_monotonic(s.size, op->name); - if (result == Monotonic::Unknown) { + switch (is_monotonic(s.size, op->name)) { + case Monotonic::Unknown: + // TODO: if bounds_of_expr_in_scope becomes more + // powerful than is_monotonic, it might be better + // to call it here. 
That would be risky though, as + // it's not exact. debug(1) << "Shared allocation for " << s.name << " has a size that is non-monontonic in the gpu block variable " << op->name @@ -359,19 +363,19 @@ class ExtractSharedAndHeapAllocations : public IRMutator { get_compiler_logger()->record_non_monotonic_loop_var(op->name, s.size); } precompute_allocation_size(s); - } else { - auto interval_bounds = bounds_of_expr_in_scope(s.size, scope); - user_assert(interval_bounds.has_upper_bound()) - << "Couldn't infer bounds for " << s.name << " shared memory allocation\n"; - // In theory we could precompute the allocation - // size if there's no upper bound too, but for the - // assert above to fail we'd have to encounter an - // expression that is_monotonic detects as - // increasing, decreasing, or constant, but is - // somehow unbounded. It's probable that no such - // expression exists. is_monotonic is generally - // less capable than bounds_of_expr_in_scope. - s.size = interval_bounds.max; + break; + case Monotonic::Increasing: + s.size = substitute(op->name, simplify(op->min + op->extent - 1), s.size); + break; + case Monotonic::Constant: + // The size expression used the variable, but we + // may have successfully eliminated it above, or + // is_monotonic might have detected that the + // dependence is false somehow. Just treat it as + // decreasing... 
+ case Monotonic::Decreasing: + s.size = substitute(op->name, op->min, s.size); + break; } } if (in_threads && op->is_parallel()) { diff --git a/src/Interval.cpp b/src/Interval.cpp index 6c9ef0d48843..10550f7ed48b 100644 --- a/src/Interval.cpp +++ b/src/Interval.cpp @@ -157,5 +157,91 @@ Expr Interval::neg_inf_noinline() { return Interval::neg_inf_expr; } +ConstantInterval::ConstantInterval() = default; + +ConstantInterval::ConstantInterval(int64_t min, int64_t max) + : min(min), max(max), min_defined(true), max_defined(true) { + internal_assert(min <= max); +} + +ConstantInterval ConstantInterval::everything() { + return ConstantInterval(); +} + +ConstantInterval ConstantInterval::single_point(int64_t x) { + return ConstantInterval(x, x); +} + +ConstantInterval ConstantInterval::bounded_below(int64_t min) { + ConstantInterval result(min, min); + result.max_defined = false; + return result; +} + +ConstantInterval ConstantInterval::bounded_above(int64_t max) { + ConstantInterval result(max, max); + result.min_defined = false; + return result; +} + +bool ConstantInterval::is_everything() const { + return !min_defined && !max_defined; +} + +bool ConstantInterval::is_single_point() const { + return min_defined && max_defined && min == max; +} + +bool ConstantInterval::is_single_point(int64_t x) const { + return min_defined && max_defined && min == x && max == x; +} + +bool ConstantInterval::has_upper_bound() const { + return max_defined; +} + +bool ConstantInterval::has_lower_bound() const { + return min_defined; +} + +bool ConstantInterval::is_bounded() const { + return has_upper_bound() && has_lower_bound(); +} + +bool ConstantInterval::operator==(const ConstantInterval &other) const { + if (min_defined != other.min_defined || max_defined != other.max_defined) { + return false; + } + return (!min_defined || min == other.min) && (!max_defined || max == other.max); +} + +void ConstantInterval::include(const ConstantInterval &i) { + if (max_defined && i.max_defined) { + 
max = std::max(max, i.max); + } else { + max_defined = false; + } + if (min_defined && i.min_defined) { + min = std::min(min, i.min); + } else { + min_defined = false; + } +} + +void ConstantInterval::include(int64_t x) { + if (max_defined) { + max = std::max(max, x); + } + if (min_defined) { + min = std::min(min, x); + } +} + +ConstantInterval ConstantInterval::make_union(const ConstantInterval &a, const ConstantInterval &b) { + ConstantInterval result = a; + result.include(b); + return result; +} + } // namespace Internal } // namespace Halide diff --git a/src/Interval.h b/src/Interval.h index 2c7c40c49712..1d90d4a29b55 100644 --- a/src/Interval.h +++ b/src/Interval.h @@ -110,6 +110,63 @@ struct Interval { static Expr neg_inf_noinline(); }; +/** A class to represent ranges of integers. Can be unbounded above or below, but + * they cannot be empty. */ +struct ConstantInterval { + /** The lower and upper bound of the interval. They are included + * in the interval. */ + int64_t min = 0, max = 0; + bool min_defined = false, max_defined = false; + + /* A default-constructed Interval is everything */ + ConstantInterval(); + + /** Construct an interval from a lower and upper bound. */ + ConstantInterval(int64_t min, int64_t max); + + /** The interval representing everything. */ + static ConstantInterval everything(); + + /** Construct an interval representing a single point. */ + static ConstantInterval single_point(int64_t x); + + /** Construct intervals bounded above or below. 
*/ + static ConstantInterval bounded_below(int64_t min); + static ConstantInterval bounded_above(int64_t max); + + /** Is the interval the entire range */ + bool is_everything() const; + + /** Is the interval just a single value (min == max) */ + bool is_single_point() const; + + /** Is the interval a particular single value */ + bool is_single_point(int64_t x) const; + + /** Does the interval have a finite least upper bound */ + bool has_upper_bound() const; + + /** Does the interval have a finite greatest lower bound */ + bool has_lower_bound() const; + + /** Does the interval have a finite upper and lower bound */ + bool is_bounded() const; + + /** Expand the interval to include another Interval */ + void include(const ConstantInterval &i); + + /** Expand the interval to include a point */ + void include(int64_t x); + + /** Construct the smallest interval containing two intervals. */ + static ConstantInterval make_union(const ConstantInterval &a, const ConstantInterval &b); + + /** Equivalent to same_as. Exists so that the autoscheduler can + * compare two map for equality in order to + * cache computations. 
*/ + bool operator==(const ConstantInterval &other) const; +}; + } // namespace Internal } // namespace Halide diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index fd8285608770..7383ed377dff 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -1,4 +1,5 @@ #include "Monotonic.h" +#include "Bounds.h" #include "IROperator.h" #include "IRVisitor.h" #include "Scope.h" @@ -30,26 +31,212 @@ using std::string; namespace { -class MonotonicVisitor : public IRVisitor { +const int64_t *as_const_int_or_uint(const Expr &e) { + if (const int64_t *i = as_const_int(e)) { + return i; + } else if (const uint64_t *u = as_const_uint(e)) { + if (*u <= (uint64_t)std::numeric_limits::max()) { + return (const int64_t *)u; + } + } + return nullptr; +} + +bool is_constant(const ConstantInterval &a) { + return a.is_single_point(0); +} + +bool may_be_negative(const ConstantInterval &a) { + return !a.has_lower_bound() || a.min < 0; +} + +bool may_be_positive(const ConstantInterval &a) { + return !a.has_upper_bound() || a.max > 0; +} + +bool is_monotonic_increasing(const ConstantInterval &a) { + return !may_be_negative(a); +} + +bool is_monotonic_decreasing(const ConstantInterval &a) { + return !may_be_positive(a); +} + +ConstantInterval to_interval(Monotonic m) { + switch (m) { + case Monotonic::Constant: + return ConstantInterval::single_point(0); + case Monotonic::Increasing: + return ConstantInterval::bounded_below(0); + case Monotonic::Decreasing: + return ConstantInterval::bounded_above(0); + case Monotonic::Unknown: + return ConstantInterval::everything(); + } + return ConstantInterval::everything(); +} + +Monotonic to_monotonic(const ConstantInterval &x) { + if (is_constant(x)) { + return Monotonic::Constant; + } else if (is_monotonic_increasing(x)) { + return Monotonic::Increasing; + } else if (is_monotonic_decreasing(x)) { + return Monotonic::Decreasing; + } else { + return Monotonic::Unknown; + } +} + +ConstantInterval unify(const ConstantInterval &a, const ConstantInterval 
&b) { + return ConstantInterval::make_union(a, b); +} + +ConstantInterval unify(const ConstantInterval &a, int64_t b) { + ConstantInterval result; + result.include(b); + return result; +} + +// Helpers for doing arithmetic on ConstantIntervals that avoid generating +// expressions of pos_inf/neg_inf. +ConstantInterval add(const ConstantInterval &a, const ConstantInterval &b) { + ConstantInterval result; + result.min_defined = a.has_lower_bound() && b.has_lower_bound(); + result.max_defined = a.has_upper_bound() && b.has_upper_bound(); + if (result.has_lower_bound()) { + result.min = a.min + b.min; + } + if (result.has_upper_bound()) { + result.max = a.max + b.max; + } + return result; +} + +ConstantInterval add(const ConstantInterval &a, int64_t b) { + return add(a, ConstantInterval(b, b)); +} + +ConstantInterval negate(const ConstantInterval &r) { + ConstantInterval result; + result.min_defined = r.has_upper_bound(); + result.min = r.has_upper_bound() ? -r.max : 0; + result.max_defined = r.has_lower_bound(); + result.max = r.has_lower_bound() ? 
-r.min : 0; + return result; +} + +ConstantInterval sub(const ConstantInterval &a, const ConstantInterval &b) { + return add(a, negate(b)); +} + +ConstantInterval sub(const ConstantInterval &a, int64_t b) { + return sub(a, ConstantInterval(b, b)); +} + +ConstantInterval multiply(const ConstantInterval &a, int64_t b) { + ConstantInterval result(a); + if (b < 0) { + result = negate(result); + b = -b; + } + if (result.has_lower_bound()) { + result.min *= b; + } + if (result.has_upper_bound()) { + result.max *= b; + } + return result; +} + +ConstantInterval multiply(const ConstantInterval &a, const Expr &b) { + if (const int64_t *bi = as_const_int_or_uint(b)) { + return multiply(a, *bi); + } + return ConstantInterval::everything(); +} + +ConstantInterval multiply(const ConstantInterval &a, const ConstantInterval &b) { + int64_t bounds[4]; + int64_t *bounds_begin = &bounds[0]; + int64_t *bounds_end = &bounds[0]; + if (a.has_lower_bound() && b.has_lower_bound()) { + *bounds_end++ = a.min * b.min; + } + if (a.has_lower_bound() && b.has_upper_bound()) { + *bounds_end++ = a.min * b.max; + } + if (a.has_upper_bound() && b.has_lower_bound()) { + *bounds_end++ = a.max * b.min; + } + if (a.has_upper_bound() && b.has_upper_bound()) { + *bounds_end++ = a.max * b.max; + } + if (bounds_begin != bounds_end) { + ConstantInterval result = { + *std::min_element(bounds_begin, bounds_end), + *std::max_element(bounds_begin, bounds_end), + }; + // There *must* be a better way than this... Even + // cutting half the cases with swapping isn't that much help. 
+ if (!a.has_lower_bound()) { + if (may_be_negative(b)) result.max_defined = false; // NOLINT + if (may_be_positive(b)) result.min_defined = false; // NOLINT + } + if (!a.has_upper_bound()) { + if (may_be_negative(b)) result.min_defined = false; // NOLINT + if (may_be_positive(b)) result.max_defined = false; // NOLINT + } + if (!b.has_lower_bound()) { + if (may_be_negative(a)) result.max_defined = false; // NOLINT + if (may_be_positive(a)) result.min_defined = false; // NOLINT + } + if (!b.has_upper_bound()) { + if (may_be_negative(a)) result.min_defined = false; // NOLINT + if (may_be_positive(a)) result.max_defined = false; // NOLINT + } + return result; + } else { + return ConstantInterval::everything(); + } +} + +ConstantInterval divide(const ConstantInterval &a, int64_t b) { + ConstantInterval result(a); + if (b < 0) { + result = negate(result); + b = -b; + } + if (result.has_lower_bound()) { + result.min = div_imp(result.min, b); + } + if (result.has_upper_bound()) { + result.max = div_imp(result.max + b - 1, b); + } + return result; +} + +class DerivativeBounds : public IRVisitor { const string &var; - Scope scope; + Scope scope; + Scope bounds; void visit(const IntImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const UIntImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const FloatImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const StringImm *) override { // require() Exprs can includes Strings. - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const Cast *op) override { @@ -67,135 +254,105 @@ class MonotonicVisitor : public IRVisitor { // A narrowing cast. There may be more cases we can catch, but // for now we punt. 
- if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } void visit(const Variable *op) override { if (op->name == var) { - result = Monotonic::Increasing; + result = ConstantInterval::single_point(1); } else if (scope.contains(op->name)) { result = scope.get(op->name); } else { - result = Monotonic::Constant; - } - } - - Monotonic flip(Monotonic r) { - switch (r) { - case Monotonic::Increasing: - return Monotonic::Decreasing; - case Monotonic::Decreasing: - return Monotonic::Increasing; - default: - return r; - } - } - - Monotonic unify(Monotonic a, Monotonic b) { - if (a == b) { - return a; - } - - if (a == Monotonic::Unknown || b == Monotonic::Unknown) { - return Monotonic::Unknown; - } - - if (a == Monotonic::Constant) { - return b; + result = ConstantInterval::single_point(0); } - - if (b == Monotonic::Constant) { - return a; - } - - return Monotonic::Unknown; } void visit(const Add *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; - result = unify(ra, rb); + ConstantInterval rb = result; + result = add(ra, rb); } void visit(const Sub *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; - result = unify(ra, flip(rb)); + ConstantInterval rb = result; + result = sub(ra, rb); } void visit(const Mul *op) override { - op->a.accept(this); - Monotonic ra = result; - op->b.accept(this); - Monotonic rb = result; - - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; - } else if (is_positive_const(op->a)) { - result = rb; - } else if (is_positive_const(op->b)) { - result = ra; - } else if (is_negative_const(op->a)) { - result = flip(rb); - } else if (is_negative_const(op->b)) { - result = flip(ra); + if (op->type.is_scalar()) { + op->a.accept(this); + ConstantInterval 
ra = result; + op->b.accept(this); + ConstantInterval rb = result; + + // This is essentially the product rule: a*rb + b*ra + // but only implemented for the case where a or b is constant. + if (const int64_t *b = as_const_int_or_uint(op->b)) { + result = multiply(ra, *b); + } else if (const int64_t *a = as_const_int_or_uint(op->a)) { + result = multiply(rb, *a); + } else { + result = ConstantInterval::everything(); + } } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Div *op) override { - op->a.accept(this); - Monotonic ra = result; - op->b.accept(this); - Monotonic rb = result; - - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; - } else if (is_positive_const(op->b)) { - result = ra; - } else if (is_negative_const(op->b)) { - result = flip(ra); + if (op->type.is_scalar()) { + op->a.accept(this); + ConstantInterval ra = result; + + if (const int64_t *b = as_const_int_or_uint(op->b)) { + result = divide(ra, *b); + } else { + result = ConstantInterval::everything(); + } } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Mod *op) override { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } void visit(const Min *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Max *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit_eq(const Expr &a, const Expr &b) { a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; b.accept(this); - Monotonic rb = result; - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; + ConstantInterval rb = 
result; + if (is_constant(ra) && is_constant(rb)) { + result = ConstantInterval::single_point(0); } else { - result = Monotonic::Unknown; + // If the result is bounded, limit it to [-1, 1]. The largest + // difference possible is flipping from true to false or false + // to true. + result = ConstantInterval(-1, 1); } } @@ -209,10 +366,19 @@ class MonotonicVisitor : public IRVisitor { void visit_lt(const Expr &a, const Expr &b) { a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; b.accept(this); - Monotonic rb = result; - result = unify(flip(ra), rb); + ConstantInterval rb = result; + result = unify(negate(ra), rb); + // If the result is bounded, limit it to [-1, 1]. The largest + // difference possible is flipping from true to false or false + // to true. + if (result.has_lower_bound()) { + result.min = std::min(std::max(result.min, -1), 1); + } + if (result.has_upper_bound()) { + result.max = std::min(std::max(result.max, -1), 1); + } } void visit(const LT *op) override { @@ -233,71 +399,63 @@ class MonotonicVisitor : public IRVisitor { void visit(const And *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Or *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Not *op) override { op->a.accept(this); - result = flip(result); + result = negate(result); } void visit(const Select *op) override { - op->condition.accept(this); - Monotonic rcond = result; - - op->true_value.accept(this); - Monotonic ra = result; - op->false_value.accept(this); - Monotonic rb = result; - Monotonic unified = unify(ra, rb); - - if (rcond == Monotonic::Constant) { - result = unified; - return; - } + // The result is the unified bounds, added to the 
"bump" that happens when switching from true to false. + if (op->type.is_scalar()) { + op->condition.accept(this); + ConstantInterval rcond = result; + + op->true_value.accept(this); + ConstantInterval ra = result; + op->false_value.accept(this); + ConstantInterval rb = result; + ConstantInterval unified = unify(ra, rb); + + // TODO: How to handle unsigned values? + Expr delta = simplify(op->true_value - op->false_value); + + Interval delta_bounds = find_constant_bounds(delta, bounds); + ConstantInterval adjusted_delta; + // TODO: Maybe we can do something with one-sided intervals? + if (delta_bounds.is_bounded()) { + ConstantInterval delta_low = multiply(rcond, delta_bounds.min); + ConstantInterval delta_high = multiply(rcond, delta_bounds.max); + adjusted_delta = ConstantInterval::make_union(delta_low, delta_high); + } else { + delta.accept(this); + ConstantInterval rdelta = result; + adjusted_delta = multiply(rcond, rdelta); + } - bool true_value_ge_false_value = can_prove(op->true_value >= op->false_value); - bool true_value_le_false_value = can_prove(op->true_value <= op->false_value); - - bool switches_from_true_to_false = rcond == Monotonic::Decreasing; - bool switches_from_false_to_true = rcond == Monotonic::Increasing; - - if (true_value_ge_false_value && - true_value_le_false_value) { - // The true value equals the false value. - result = ra; - } else if ((unified == Monotonic::Increasing || unified == Monotonic::Constant) && - ((switches_from_false_to_true && true_value_ge_false_value) || - (switches_from_true_to_false && true_value_le_false_value))) { - // Both paths increase, and the condition makes it switch - // from the lesser path to the greater path. 
- result = Monotonic::Increasing; - } else if ((unified == Monotonic::Decreasing || unified == Monotonic::Constant) && - ((switches_from_false_to_true && true_value_le_false_value) || - (switches_from_true_to_false && true_value_ge_false_value))) { - // Both paths decrease, and the condition makes it switch - // from the greater path to the lesser path. - result = Monotonic::Decreasing; + result = add(unified, adjusted_delta); } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Load *op) override { op->index.accept(this); - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } @@ -331,52 +489,55 @@ class MonotonicVisitor : public IRVisitor { return; } - if (!op->is_pure()) { + if (!op->is_pure() || !is_constant(result)) { // Even with constant args, the result could vary from one loop iteration to the next. - result = Monotonic::Unknown; + result = ConstantInterval::everything(); return; } for (size_t i = 0; i < op->args.size(); i++) { op->args[i].accept(this); - if (result != Monotonic::Constant) { + if (!is_constant(result)) { // One of the args is not constant. - result = Monotonic::Unknown; + result = ConstantInterval::everything(); return; } } - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const Let *op) override { op->value.accept(this); - if (result == Monotonic::Constant) { + ScopedBinding bounds_binding(bounds, op->name, find_constant_bounds(op->value, bounds)); + + if (is_constant(result)) { // No point pushing it if it's constant w.r.t the var, // because unknown variables are treated as constant. 
op->body.accept(this); } else { - scope.push(op->name, result); + ScopedBinding scope_binding(scope, op->name, result); op->body.accept(this); - scope.pop(op->name); } } void visit(const Shuffle *op) override { for (size_t i = 0; i < op->vectors.size(); i++) { op->vectors[i].accept(this); - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); return; } } - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const VectorReduce *op) override { op->value.accept(this); switch (op->op) { case VectorReduce::Add: + result = multiply(result, op->value.type().lanes() / op->type.lanes()); + break; case VectorReduce::Min: case VectorReduce::Max: // These reductions are monotonic in the arg @@ -385,8 +546,8 @@ class MonotonicVisitor : public IRVisitor { case VectorReduce::And: case VectorReduce::Or: // These ones are not - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } } @@ -456,25 +617,43 @@ class MonotonicVisitor : public IRVisitor { } public: - Monotonic result; + ConstantInterval result; - MonotonicVisitor(const std::string &v, const Scope &parent) - : var(v), result(Monotonic::Unknown) { + DerivativeBounds(const std::string &v, const Scope &parent) + : var(v), result(ConstantInterval::everything()) { scope.set_containing_scope(&parent); } }; } // namespace -Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { +ConstantInterval derivative_bounds(const Expr &e, const std::string &var, const Scope &scope) { if (!e.defined()) { - return Monotonic::Unknown; + return ConstantInterval::everything(); } - MonotonicVisitor m(var, scope); + DerivativeBounds m(var, scope); e.accept(&m); return m.result; } +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { + if (!e.defined()) { + return 
Monotonic::Unknown; + } + return to_monotonic(derivative_bounds(e, var, scope)); +} + +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { + if (!e.defined()) { + return Monotonic::Unknown; + } + Scope intervals_scope; + for (Scope::const_iterator i = scope.cbegin(); i != scope.cend(); ++i) { + intervals_scope.push(i.name(), to_interval(i.value())); + } + return is_monotonic(e, var, intervals_scope); +} + namespace { void check_increasing(const Expr &e) { internal_assert(is_monotonic(e, "x") == Monotonic::Increasing) @@ -506,6 +685,7 @@ void is_monotonic_test() { check_increasing(x + 4); check_increasing(x + y); check_increasing(x * 4); + check_increasing(x / 4); check_increasing(min(x + 4, y + 4)); check_increasing(max(x + y, x - y)); check_increasing(x >= y); @@ -513,12 +693,17 @@ void is_monotonic_test() { check_decreasing(-x); check_decreasing(x * -4); + check_decreasing(x / -4); check_decreasing(y - x); check_decreasing(x < y); check_decreasing(x <= y); check_unknown(x == y); check_unknown(x != y); + check_increasing(y <= x); + check_increasing(y < x); + check_decreasing(x <= y); + check_decreasing(x < y); check_unknown(x * y); // Not constant despite having constant args, because there's a side-effect. 
@@ -527,10 +712,14 @@ void is_monotonic_test() { check_increasing(select(y == 2, x, x + 4)); check_decreasing(select(y == 2, -x, x * -4)); - check_increasing(select(x > 2, x + 1, x)); - check_increasing(select(x < 2, x, x + 1)); - check_decreasing(select(x > 2, -x - 1, -x)); - check_decreasing(select(x < 2, -x, -x - 1)); + check_unknown(select(x > 2, x - 2, x)); + check_unknown(select(x < 2, x, x - 2)); + check_unknown(select(x > 2, -x + 2, -x)); + check_unknown(select(x < 2, -x, -x + 2)); + check_increasing(select(x > 2, x - 1, x)); + check_increasing(select(x < 2, x, x - 1)); + check_decreasing(select(x > 2, -x + 1, -x)); + check_decreasing(select(x < 2, -x, -x + 1)); check_unknown(select(x < 2, x, x - 5)); check_unknown(select(x > 2, x - 5, x)); @@ -546,6 +735,12 @@ void is_monotonic_test() { check_constant(select(y > 3, y + 23, y - 65)); + check_decreasing(select(2 <= x, 0, 1)); + check_increasing(select(2 <= x, 0, 1) + x); + check_decreasing(-min(x, 16)); + + check_unknown(select(0 < x, max(min(x, 4), 3), 4)); + std::cout << "is_monotonic test passed" << std::endl; } diff --git a/src/Monotonic.h b/src/Monotonic.h index c06fe8eac289..3d7946a13ed7 100644 --- a/src/Monotonic.h +++ b/src/Monotonic.h @@ -8,12 +8,16 @@ #include #include -#include "Expr.h" +#include "Interval.h" #include "Scope.h" namespace Halide { namespace Internal { +/** Find the bounds of the derivative of an expression. */ +ConstantInterval derivative_bounds(const Expr &e, const std::string &var, + const Scope &scope = Scope::empty_scope()); + /** * Detect whether an expression is monotonic increasing in a variable, * decreasing, or unknown. 
@@ -23,7 +27,8 @@ enum class Monotonic { Constant, Decreasing, Unknown }; Monotonic is_monotonic(const Expr &e, const std::string &var, - const Scope &scope = Scope::empty_scope()); + const Scope &scope = Scope::empty_scope()); +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope); /** Emit the monotonic class in human-readable form for debugging. */ std::ostream &operator<<(std::ostream &stream, const Monotonic &m); diff --git a/src/SimplifyCorrelatedDifferences.cpp b/src/SimplifyCorrelatedDifferences.cpp index 0c78cfd2ad4a..cf7b5475e3d3 100644 --- a/src/SimplifyCorrelatedDifferences.cpp +++ b/src/SimplifyCorrelatedDifferences.cpp @@ -24,7 +24,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { string loop_var; - Scope monotonic; + Scope monotonic; struct OuterLet { string name; @@ -38,11 +38,11 @@ class SimplifyCorrelatedDifferences : public IRMutator { // Visit an entire chain of lets in a single method to conserve stack space. struct Frame { const LetStmtOrLet *op; - ScopedBinding binding; + ScopedBinding binding; Expr new_value; - Frame(const LetStmtOrLet *op, const string &loop_var, Scope &scope) + Frame(const LetStmtOrLet *op, const string &loop_var, Scope &scope) : op(op), - binding(scope, op->name, is_monotonic(op->value, loop_var, scope)) { + binding(scope, op->name, derivative_bounds(op->value, loop_var, scope)) { } Frame(const LetStmtOrLet *op) : op(op) { @@ -59,7 +59,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { // same name. If we decide not to add an inner let, but do add // the outer one, then later references to it will be // incorrect. Second, if we don't add something that happens - // to be non-monotonic, then is_monotonic finds a variable + // to be non-monotonic, then derivative_bounds finds a variable // that references it in a later let, it will think it's a // constant, not an unknown. 
do { @@ -118,7 +118,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { tmp_lets.swap(lets); loop_var = op->name; { - ScopedBinding bind(monotonic, loop_var, Monotonic::Increasing); + ScopedBinding bind(monotonic, loop_var, ConstantInterval::single_point(1)); s = IRMutator::visit(op); } loop_var.clear(); diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index 845aaa07527d..f18e29645ac7 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -36,6 +36,7 @@ class Simplify : public VariadicVisitor { struct ExprInfo { // We track constant integer bounds when they exist + // TODO: Use ConstantInterval? int64_t min = 0, max = 0; bool min_defined = false, max_defined = false; // And the alignment of integer variables diff --git a/src/SlidingWindow.cpp b/src/SlidingWindow.cpp index e55848db5783..9e1b7114eedb 100644 --- a/src/SlidingWindow.cpp +++ b/src/SlidingWindow.cpp @@ -3,20 +3,30 @@ #include "Bounds.h" #include "CompilerLogger.h" #include "Debug.h" +#include "ExprUsesVar.h" +#include "IREquality.h" +#include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "Monotonic.h" #include "Scope.h" #include "Simplify.h" +#include "Solve.h" #include "Substitute.h" +#include +#include #include namespace Halide { namespace Internal { +using std::list; using std::map; +using std::pair; +using std::set; using std::string; +using std::vector; namespace { @@ -61,7 +71,7 @@ class ExpandExpr : public IRMutator { Expr visit(const Variable *var) override { if (scope.contains(var->name)) { Expr expr = scope.get(var->name); - debug(3) << "Fully expanded " << var->name << " -> " << expr << "\n"; + debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; return expr; } else { return var; @@ -78,16 +88,44 @@ class ExpandExpr : public IRMutator { Expr expand_expr(const Expr &e, const Scope &scope) { ExpandExpr ee(scope); Expr result = ee.mutate(e); - debug(3) << "Expanded " << e << " into " << result << 
"\n"; + debug(4) << "Expanded " << e << " into " << result << "\n"; return result; } +class FindProduce : public IRVisitor { + const string &func; + + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (op->is_producer && op->name == func) { + found = true; + } else { + IRVisitor::visit(op); + } + } + +public: + bool found = false; + + FindProduce(const string &func) + : func(func) { + } +}; + +bool find_produce(const Stmt &s, const string &func) { + FindProduce finder(func); + s.accept(&finder); + return finder.found; +} + // Perform sliding window optimization for a function over a // particular serial for loop class SlidingWindowOnFunctionAndLoop : public IRMutator { Function func; string loop_var; Expr loop_min; + set &slid_dimensions; Scope scope; map replacements; @@ -112,10 +150,10 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { } Stmt visit(const ProducerConsumer *op) override { - if (!op->is_producer || (op->name != func.name())) { - return IRMutator::visit(op); - } else { - Stmt stmt = op; + if (op->is_producer) { + if (op->name != func.name()) { + return IRMutator::visit(op); + } // We're interested in the case where exactly one of the // dimensions of the buffer has a min/extent that depends @@ -131,6 +169,10 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { string prefix = func.name() + ".s" + std::to_string(func.updates().size()) + "."; const std::vector func_args = func.args(); for (int i = 0; i < func.dimensions(); i++) { + if (slid_dimensions.count(i)) { + debug(3) << "Already slid over dimension " << i << ", so skipping it.\n"; + continue; + } // Look up the region required of this function's last stage string var = prefix + func_args[i]; internal_assert(scope.contains(var + ".min") && scope.contains(var + ".max")); @@ -169,7 +211,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { debug(3) << "Could not perform sliding window optimization of " << func.name() << " over " << 
loop_var << " because multiple " << "dimensions of the function dependended on the loop var\n"; - return stmt; + return op; } // If the function is not pure in the given dimension, give up. We also @@ -185,7 +227,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { debug(3) << "Could not performance sliding window optimization of " << func.name() << " over " << loop_var << " because the function " << "scatters along the related axis.\n"; - return stmt; + return op; } bool can_slide_up = false; @@ -219,7 +261,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { << " because I couldn't prove it moved monotonically along that dimension\n" << "Min is " << min_required << "\n" << "Max is " << max_required << "\n"; - return stmt; + return op; } // Ok, we've isolated a function, a dimension to slide @@ -242,26 +284,96 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { << " there's no overlap in the region computed across iterations\n" << "Min is " << min_required << "\n" << "Max is " << max_required << "\n"; - return stmt; + return op; } + // Update the bounds of this producer assuming the previous iteration + // has run already. Expr new_min, new_max; if (can_slide_up) { - new_min = select(loop_var_expr <= loop_min, min_required, likely_if_innermost(prev_max_plus_one)); + new_min = prev_max_plus_one; new_max = max_required; } else { new_min = min_required; - new_max = select(loop_var_expr <= loop_min, max_required, likely_if_innermost(prev_min_minus_one)); + new_max = prev_min_minus_one; } - Expr early_stages_min_required = new_min; - Expr early_stages_max_required = new_max; + // See if we can find a new min for the loop that can warm up the + // sliding window. We're going to do this by building an equation + // that describes the constraints we have on our new loop min. The + // first constraint is that the new loop min is not after the + // loop min. 
+ string new_loop_min_name = unique_name('x'); + Expr new_loop_min_var = Variable::make(Int(32), new_loop_min_name); + Expr new_loop_min_eq = new_loop_min_var <= loop_min; + Expr new_min_at_new_loop_min = substitute(loop_var, new_loop_min_var, new_min); + Expr new_max_at_new_loop_min = substitute(loop_var, new_loop_min_var, new_max); + if (can_slide_up) { + // We need to find a new loop min that satisfies these constraints: + // - The new min at the new loop min needs to be before the min + // required at the original min. + // - The new max needs to be greater than the new min, both at the + // new loop min. This guarantees that the sliding window is not empty. + // Together, these conditions guarantee the sliding window is warmed + // up. The first condition checks that we reached the original loop + // min, and the second condition checks that the iterations before + // the original min weren't empty. + Expr min_required_at_loop_min = substitute(loop_var, loop_min, min_required); + new_loop_min_eq = new_loop_min_eq && + new_min_at_new_loop_min <= min_required_at_loop_min && + new_max_at_new_loop_min >= new_min_at_new_loop_min; + } else { + // When sliding down, the constraints are similar, just swapping + // the roles of the min and max. + Expr max_required_at_loop_min = substitute(loop_var, loop_min, max_required); + new_loop_min_eq = new_loop_min_eq && + new_max_at_new_loop_min >= max_required_at_loop_min && + new_min_at_new_loop_min <= new_max_at_new_loop_min; + } + // Try to solve the equation. + new_loop_min_eq = simplify(new_loop_min_eq); + Interval solve_result = solve_for_inner_interval(new_loop_min_eq, new_loop_min_name); + internal_assert(!new_loop_min.defined()); + if (solve_result.has_upper_bound() && !equal(solve_result.max, loop_min)) { + new_loop_min = simplify(solve_result.max); + + // We have a new loop min, so we can assume every iteration has
In order for this to be safe, we need + // the new min/max at the new loop min to be less than or equal to + // the min/max required at the original loop min. + Expr loop_var_expr = Variable::make(Int(32), loop_var); + Expr orig_loop_min_expr = Variable::make(Int(32), loop_var + ".loop_min.orig"); + if (can_slide_up) { + Expr min_required_at_loop_min = substitute(loop_var, orig_loop_min_expr, min_required); + new_min = max(new_min, min_required_at_loop_min); + } else { + Expr max_required_at_loop_min = substitute(loop_var, orig_loop_min_expr, max_required); + new_max = min(new_max, max_required_at_loop_min); + } + } else { + // We couldn't find a suitable new loop min, we can't assume + // every iteration has a previous iteration. The first iteration + // will warm up the loop. + Expr need_explicit_warmup = loop_var_expr <= loop_min; + if (can_slide_up) { + new_min = select(need_explicit_warmup, min_required, likely_if_innermost(new_min)); + } else { + new_max = select(need_explicit_warmup, max_required, likely_if_innermost(new_max)); + } + } + new_min = simplify(new_min); + new_max = simplify(new_max); debug(3) << "Sliding " << func.name() << ", " << dim << "\n" << "Pushing min up from " << min_required << " to " << new_min << "\n" - << "Shrinking max from " << max_required << " to " << new_max << "\n"; + << "Shrinking max from " << max_required << " to " << new_max << "\n" + << "Adjusting loop_min from " << loop_min << " to " << new_loop_min << "\n" + << "Equation is " << new_loop_min_eq << "\n"; + + slid_dimensions.insert(dim_idx); // Now redefine the appropriate regions required + internal_assert(replacements.empty()); if (can_slide_up) { replacements[prefix + dim + ".min"] = new_min; } else { @@ -280,19 +392,55 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { // the last stage to cover values produced by stages // before the last one. Because, e.g., an intermediate // stage may be unrolled, expanding its bounds provided. 
+ Stmt result = op; if (!func.updates().empty()) { Box b = box_provided(op->body, func.name()); if (can_slide_up) { string n = prefix + dim + ".min"; Expr var = Variable::make(Int(32), n); - stmt = LetStmt::make(n, min(var, b[dim_idx].min), stmt); + result = LetStmt::make(n, min(var, b[dim_idx].min), result); } else { string n = prefix + dim + ".max"; Expr var = Variable::make(Int(32), n); - stmt = LetStmt::make(n, max(var, b[dim_idx].max), stmt); + result = LetStmt::make(n, max(var, b[dim_idx].max), result); + } + } + + return result; + } else if (!find_produce(op, func.name()) && new_loop_min.defined()) { + // The producer might have expanded the loop before the min to warm + // up the window. This consumer doesn't contain a producer that might + // be part of the warmup, so guard it with an if to only run it on + // the original loop bounds. + Expr loop_var_expr = Variable::make(Int(32), loop_var); + Expr orig_loop_min_expr = Variable::make(Int(32), loop_var + ".loop_min.orig"); + Expr guard = likely_if_innermost(orig_loop_min_expr <= loop_var_expr); + + // Put the if inside the consumer node, so semaphores end up outside the if. + // TODO: This is correct, but it produces slightly suboptimal code: if we + // didn't do this, the loop could likely be trimmed and the if simplified away. + Stmt body = mutate(op->body); + if (const IfThenElse *old_guard = body.as()) { + Expr x = Variable::make(Int(32), "*"); + vector matches; + if (expr_match(likely_if_innermost(x <= loop_var_expr), old_guard->condition, matches)) { + // There's already a condition on loop_var_expr here. Since we're + // adding a condition at the old loop min, this if must already be + // guarding more than we will. 
+ guard = Expr(); } } - return stmt; + if (guard.defined()) { + debug(3) << "Guarding body " << guard << "\n"; + body = IfThenElse::make(guard, body); + } + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make_consume(op->name, body); + } + } else { + return IRMutator::visit(op); } } @@ -340,39 +488,123 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { } public: - SlidingWindowOnFunctionAndLoop(Function f, string v, Expr v_min) - : func(std::move(f)), loop_var(std::move(v)), loop_min(std::move(v_min)) { + SlidingWindowOnFunctionAndLoop(Function f, string v, Expr v_min, set &slid_dimensions) + : func(std::move(f)), loop_var(std::move(v)), loop_min(std::move(v_min)), slid_dimensions(slid_dimensions) { } + + Expr new_loop_min; }; -// Perform sliding window optimization for a particular function -class SlidingWindowOnFunction : public IRMutator { - Function func; +// In Stmt s, does the production of b depend on a? +// We can't use produce/consume nodes to determine this, because they're "loose". +// For example, we get this: +// +// produce a { +// a(...) = ... +// } +// consume a { +// produce b { +// b(...) = ... // not depending on a +// } +// consume b { +// c(...) = a(...) + b(...) +// } +// } +// +// When we'd rather see this: +// +// produce a { +// a(...) = ... +// } +// produce b { +// b(...) = ... // not depending on a +// } +// consume a { +// consume b { +// c(...) = a(...) + b(...) +// } +// } +// +// TODO: We might also need to figure out transitive dependencies...? If so, it +// would be best to just fix the produce/consume relationships as above. We would +// just be able to look for produce b inside produce a. 
+class Dependencies : public IRVisitor { + using IRVisitor::visit; - using IRMutator::visit; + const string &producer; + bool in_producer = false; - Stmt visit(const For *op) override { - debug(3) << " Doing sliding window analysis over loop: " << op->name << "\n"; + void visit(const ProducerConsumer *op) override { + ScopedValue old_finding_a(in_producer, in_producer || (op->is_producer && op->name == producer)); + return IRVisitor::visit(op); + } - Stmt new_body = op->body; + void visit(const Call *op) override { + if (in_producer && op->call_type == Call::Halide) { + if (op->name != producer) { + dependencies.insert(op->name); + } + } + IRVisitor::visit(op); + } - new_body = mutate(new_body); +public: + set dependencies; + + Dependencies(const string &producer) + : producer(producer) { + } +}; - if (op->for_type == ForType::Serial || - op->for_type == ForType::Unrolled) { - new_body = SlidingWindowOnFunctionAndLoop(func, op->name, op->min).mutate(new_body); +bool depends_on(const string &a, const string &b, const Stmt &s, map &cache) { + if (a == b) { + return true; + } + auto cached = cache.find(b); + if (cached != cache.end()) { + return cached->second; + } + Dependencies deps(b); + s.accept(&deps); + // Recursively search for dependencies. + for (const string &i : deps.dependencies) { + if (depends_on(a, i, s, cache)) { + cache[b] = true; + return true; } + } + cache[b] = false; + return false; +} - if (new_body.same_as(op->body)) { - return op; +bool depends_on(const string &a, const string &b, const Stmt &s) { + map cache; + return depends_on(a, b, s, cache); +} + +// Update the loop variable referenced by prefetch directives. 
+class SubstitutePrefetchVar : public IRMutator { + const string &old_var; + const string &new_var; + + using IRMutator::visit; + + Stmt visit(const Prefetch *op) override { + Stmt new_body = mutate(op->body); + if (op->prefetch.var == old_var) { + PrefetchDirective p = op->prefetch; + p.var = new_var; + return Prefetch::make(op->name, op->types, op->bounds, p, op->condition, new_body); + } else if (!new_body.same_as(op->body)) { + return Prefetch::make(op->name, op->types, op->bounds, op->prefetch, op->condition, new_body); } else { - return For::make(op->name, op->min, op->extent, op->for_type, op->device_api, new_body); + return op; } } public: - SlidingWindowOnFunction(Function f) - : func(std::move(f)) { + SubstitutePrefetchVar(const string &old_var, const string &new_var) + : old_var(old_var), new_var(new_var) { } }; @@ -380,6 +612,13 @@ class SlidingWindowOnFunction : public IRMutator { class SlidingWindow : public IRMutator { const map &env; + // A map of which dimensions we've already slid over, by Func name. + map> slid_dimensions; + + // Keep track of realizations we want to slide, from innermost to + // outermost. + list sliding; + using IRMutator::visit; Stmt visit(const Realize *op) override { @@ -399,13 +638,17 @@ class SlidingWindow : public IRMutator { return IRMutator::visit(op); } - Stmt new_body = op->body; - - debug(3) << "Doing sliding window analysis on realization of " << op->name << "\n"; - - new_body = SlidingWindowOnFunction(iter->second).mutate(new_body); - - new_body = mutate(new_body); + // We want to slide innermost first, so put it on the front of + // the list. + sliding.push_front(iter->second); + Stmt new_body = mutate(op->body); + sliding.pop_front(); + // Remove tracking of slid dimensions when we're done realizing + // it in case a realization appears elsewhere. 
+ auto slid_it = slid_dimensions.find(iter->second.name()); + if (slid_it != slid_dimensions.end()) { + slid_dimensions.erase(slid_it); + } if (new_body.same_as(op->body)) { return op; @@ -415,16 +658,135 @@ } } + Stmt visit(const For *op) override { + if (!(op->for_type == ForType::Serial || op->for_type == ForType::Unrolled)) { + return IRMutator::visit(op); + } + debug(3) << "Doing sliding window analysis on loop " << op->name << "\n"; + + string name = op->name; + Stmt body = op->body; + Expr loop_min = op->min; + Expr loop_extent = op->extent; + Expr loop_max = Variable::make(Int(32), op->name + ".loop_max"); + + list> prev_loop_mins; + list> new_lets; + for (const Function &func : sliding) { + debug(3) << "Doing sliding window analysis on function " << func.name() << "\n"; + + // Figure out where we should start sliding from. If no + // other func needs this func, we can just start at the + // original loop min. + Expr prev_loop_min = op->min; + // If a previously slid func needs this func to be warmed + // up, then we need to back up the loop to warm up this + // func before the already slid func starts warming up. + for (const auto &i : prev_loop_mins) { + if (depends_on(func.name(), i.first, body)) { + prev_loop_min = i.second; + break; + } + } + + SlidingWindowOnFunctionAndLoop slider(func, name, prev_loop_min, slid_dimensions[func.name()]); + body = slider.mutate(body); + + if (slider.new_loop_min.defined()) { + Expr new_loop_min = slider.new_loop_min; + if (!prev_loop_min.same_as(loop_min)) { + // If we didn't start sliding from the previous + // loop min, the old loop min might already + // be further back than this new one. + new_loop_min = min(new_loop_min, loop_min); + } + + // Put this at the front of the list, so we find it first + // when checking subsequent funcs. + prev_loop_mins.emplace_front(func.name(), new_loop_min); + + // Update the loop body to use the adjusted loop min.
+ string new_name = name + ".$n"; + loop_min = Variable::make(Int(32), new_name + ".loop_min"); + loop_extent = Variable::make(Int(32), new_name + ".loop_extent"); + body = substitute({ + {name, Variable::make(Int(32), new_name)}, + {name + ".loop_min", loop_min}, + {name + ".loop_extent", loop_extent}, + }, + body); + body = SubstitutePrefetchVar(name, new_name).mutate(body); + + name = new_name; + + // The new loop interval is the new loop min to the loop max. + new_lets.emplace_front(name + ".loop_min", new_loop_min); + new_lets.emplace_front(name + ".loop_min.orig", loop_min); + new_lets.emplace_front(name + ".loop_extent", (loop_max - loop_min) + 1); + } + } + + body = mutate(body); + + if (body.same_as(op->body) && loop_min.same_as(op->min) && loop_extent.same_as(op->extent) && name == op->name) { + return op; + } else { + Stmt result = For::make(name, loop_min, loop_extent, op->for_type, op->device_api, body); + if (!new_lets.empty()) { + result = LetStmt::make(name + ".loop_max", loop_max, result); + } + for (const auto &i : new_lets) { + result = LetStmt::make(i.first, i.second, result); + } + return result; + } + } + + Stmt visit(const IfThenElse *op) override { + // Don't let specializations corrupt the tracking of which + // dimensions have been slid. + map> old_slid_dimensions = slid_dimensions; + Stmt then_case = mutate(op->then_case); + slid_dimensions = old_slid_dimensions; + Stmt else_case = mutate(op->else_case); + slid_dimensions = old_slid_dimensions; + if (then_case.same_as(op->then_case) && else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } + } + public: SlidingWindow(const map &e) : env(e) { } }; +// It is convenient to be able to assume that loops have a .loop_min.orig +// let in addition to .loop_min. Most of these will get simplified away. 
+class AddLoopMinOrig : public IRMutator { + using IRMutator::visit; + + Stmt visit(const For *op) override { + Stmt body = mutate(op->body); + Expr min = mutate(op->min); + Expr extent = mutate(op->extent); + + Stmt result; + if (body.same_as(op->body) && min.same_as(op->min) && extent.same_as(op->extent)) { + result = op; + } else { + result = For::make(op->name, min, extent, op->for_type, op->device_api, body); + } + return LetStmt::make(op->name + ".loop_min.orig", Variable::make(Int(32), op->name + ".loop_min"), result); + } +}; + } // namespace Stmt sliding_window(const Stmt &s, const map &env) { - return SlidingWindow(env).mutate(s); + return SlidingWindow(env).mutate(AddLoopMinOrig().mutate(s)); } } // namespace Internal diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index 1c4c4c62cb0f..548d46cb57cf 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -512,6 +512,8 @@ class AttemptStorageFoldingOfFunction : public IRMutator { Box provided = box_provided(body, func.name()); Box required = box_required(body, func.name()); + // For storage folding, we don't care about conditional reads. 
+ required.used = Expr(); Box box = box_union(provided, required); Expr loop_var = Variable::make(Int(32), op->name); @@ -674,8 +676,10 @@ class AttemptStorageFoldingOfFunction : public IRMutator { // Can't do much with this dimension if (!explicit_only) { debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" - << "min = " << min << "\n" - << "max = " << max << "\n"; + << "min_initial = " << min_initial << "\n" + << "min_steady = " << min_steady << "\n" + << "max_initial = " << max_initial << "\n" + << "max_steady = " << max_steady << "\n"; } else { debug(3) << "Not folding because there is no explicit storage folding factor\n"; } @@ -789,22 +793,16 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - // Logically we acquire the entire extent on - // the first iteration: - - // to_acquire = select(loop_var > loop_min, to_acquire, extent); - - // However it's simpler to implement this by - // just reducing the initial value on the - // semaphore by the difference, as long as it - // doesn't lift any inner names out of scope. - - Expr fudge = simplify(substitute(op->name, loop_min, extent - to_acquire)); - if (is_const(fudge) && can_prove(fudge <= sema.init)) { - sema.init -= fudge; - } else { - to_acquire = select(loop_var > loop_min, likely(to_acquire), extent); + if (provided.used.defined()) { + to_acquire = select(provided.used, to_acquire, 0); } + // We should always release the required region, even if we don't use it. + + // On the first iteration, we need to acquire the extent of the region shared + // between the producer and consumer, and we need to release it on the last + // iteration. 
+ to_acquire = select(loop_var > loop_min, to_acquire, extent); + to_release = select(loop_var < loop_max, to_release, extent); // We may need dynamic assertions that a positive // amount of the semaphore is acquired/released, @@ -864,10 +862,8 @@ class AttemptStorageFoldingOfFunction : public IRMutator { } else { stmt = op; debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" - << "min_initial = " << min_initial << "\n" - << "min_steady = " << min_steady << "\n" - << "max_initial = " << max_initial << "\n" - << "max_steady = " << max_steady << "\n"; + << "min = " << min << "\n" + << "max = " << max << "\n"; break; } } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9873896b4223..fc1a9a182f49 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -14,11 +14,6 @@ tests(GROUPS correctness atomic_tuples.cpp atomics.cpp autodiff.cpp - autotune_bug.cpp - autotune_bug_2.cpp - autotune_bug_3.cpp - autotune_bug_4.cpp - autotune_bug_5.cpp bad_likely.cpp bit_counting.cpp bitwise_ops.cpp diff --git a/test/correctness/autotune_bug.cpp b/test/correctness/autotune_bug.cpp deleted file mode 100644 index f8403b6d49ec..000000000000 --- a/test/correctness/autotune_bug.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#define AUTOTUNE_N 16, 16 - -// This tests a segfault generated by an autotuned schedule. 
- -#include "Halide.h" -#include - -using namespace Halide; - -int main(int argc, char **argv) { - - ImageParam in_img(UInt(16), 2); - Func blur_x("blur_x"), blur_y("blur_y"); - Var x("x"), y("y"), xi("xi"), yi("yi"); - - Func input; - input(x, y) = in_img(clamp(x, 1, in_img.width() - 1), - clamp(y, 1, in_img.height()) - 1); - - // The algorithm - blur_x(x, y) = (input(x, y) + input(x + 1, y) + input(x + 2, y)) / 3; - blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3; - - Halide::Var _x2; - input - .reorder_storage(y, x) - .compute_root(); - blur_x - .split(x, x, _x2, 4) - .compute_at(blur_y, y); - blur_y - .reorder(y, x); - - blur_y.compile_jit(); - blur_y.infer_input_bounds({AUTOTUNE_N}); - assert(in_img.get().data()); - blur_y.realize({AUTOTUNE_N}); - - printf("Success!\n"); - return 0; -} diff --git a/test/correctness/autotune_bug_2.cpp b/test/correctness/autotune_bug_2.cpp deleted file mode 100644 index 65fc7d507e80..000000000000 --- a/test/correctness/autotune_bug_2.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f will be stored from 0 to 8 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 8) { - printf("Bounds on realization of f were supposed to be >= [0, 9]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"); - Var x("x"); - f(x) = x; - RDom r(17, 1); - f(x) = r; - f.store_root(); - - g(x) = f(x) + f(x + 1); - f.compute_at(g, x); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 8); - f.update(); - - f.trace_realizations().trace_stores(); - - g.set_custom_trace(&my_trace); - g.bound(x, 0, 2); - g.output_buffer().dim(0).set_bounds(0, 2); - g.realize({2}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_3.cpp 
b/test/correctness/autotune_bug_3.cpp deleted file mode 100644 index 4bab12448b76..000000000000 --- a/test/correctness/autotune_bug_3.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f will be stored from 0 to 8 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 8) { - printf("Bounds on realization of f were supposed to be >= [0, 8]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"); - Var x("x"); - f(x) = x; - f.store_root(); - - g(x) = f(x) + f(x + 1); - f.compute_at(g, x); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 8); - - f.trace_realizations().trace_stores(); - - g.set_custom_trace(&my_trace); - g.bound(x, 0, 2); - g.output_buffer().dim(0).set_bounds(0, 2); - g.realize({2}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_4.cpp b/test/correctness/autotune_bug_4.cpp deleted file mode 100644 index 6fc5a0751f6e..000000000000 --- a/test/correctness/autotune_bug_4.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f and g will be stored from 0 to 7 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 7) { - printf("Bounds on realization were supposed to be = [0, 7]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"), h("h"); - Var x("x"); - - f(x) = x; - g(x) = f(x); - h(x) = g(x) + g(x + 1); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 4); - g.split(x, xo, xi, 5); - h.split(x, xo, xi, 6); - f.compute_at(h, xo); - g.compute_at(h, xo); - g.store_root(); 
- - f.trace_realizations().trace_stores().trace_loads(); - g.trace_realizations().trace_stores().trace_loads(); - - h.set_custom_trace(&my_trace); - h.bound(x, 0, 6); - h.realize({6}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_5.cpp b/test/correctness/autotune_bug_5.cpp deleted file mode 100644 index e012a1121501..000000000000 --- a/test/correctness/autotune_bug_5.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int main(int argc, char **argv) { - Buffer input(1024, 1024); - - Func upsampled("upsampled"); - Func upsampledx("upsampledx"); - Var x("x"), y("y"); - - Func clamped("clamped"); - clamped(x, y) = input(x, y); - - upsampledx(x, y) = select((x % 2) == 0, - clamped(x, y), - clamped(x + 1, y)); - upsampled(x, y) = upsampledx(x, y); - - Var xi("xi"), yi("yi"); - clamped.compute_root(); // passes if this is removed, switched to inline - upsampled - .split(y, y, yi, 8) - .reorder(yi, y, x) - .compute_root(); - - upsampledx.compute_at(upsampled, yi); - - upsampled.realize({100, 100}); - - printf("Success!\n"); - return 0; -} diff --git a/test/correctness/sliding_reduction.cpp b/test/correctness/sliding_reduction.cpp index 087bc2d06b7d..3ce75056a09b 100644 --- a/test/correctness/sliding_reduction.cpp +++ b/test/correctness/sliding_reduction.cpp @@ -87,10 +87,8 @@ int main(int argc, char **argv) { // clobber earlier values of the final stage of f, so we have // to compute the final stage of f two rows at a time as well. - // The result is that we evaluate the first three rows of f - // for the first scanline of g, and then another two rows for - // every row of g thereafter. This adds up to 2*(3 + 9*2) = 42 - // evaluations of f. + // The result is that we extend the loop to warm up f by 2 + // iterations. This adds up to 2*(12*2) = 48 evaluations of f. 
Func f("f"); f(x, y) = x; f(0, y) += f(1, y) + f(2, y); @@ -108,7 +106,7 @@ int main(int argc, char **argv) { counter = 0; check(g.realize({2, 10})); - int correct = 42; + int correct = 48; if (counter != correct) { printf("Failed sliding a reduction: %d evaluations instead of %d\n", counter, correct); return -1; diff --git a/test/correctness/sliding_window.cpp b/test/correctness/sliding_window.cpp index 413bf9233160..31875158600a 100644 --- a/test/correctness/sliding_window.cpp +++ b/test/correctness/sliding_window.cpp @@ -40,6 +40,9 @@ int main(int argc, char **argv) { f.store_root().compute_at(g, x); + // Test that sliding window works when specializing. + g.specialize(g.output_buffer().dim(0).min() == 0); + Buffer im = g.realize({100}); // f should be able to tell that it only needs to compute each value once @@ -49,6 +52,44 @@ int main(int argc, char **argv) { } } + // Try two producers used by the same consumer. + { + count = 0; + Func f, g, h; + + f(x) = call_counter(2 * x + 0, 0); + g(x) = call_counter(2 * x + 1, 0); + h(x) = f(x) + f(x - 1) + g(x) + g(x - 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + Buffer im = h.realize({100}); + if (count != 202) { + printf("f was called %d times instead of %d times\n", count, 202); + return -1; + } + } + + // Try a sequence of two sliding windows. + { + count = 0; + Func f, g, h; + + f(x) = call_counter(2 * x + 0, 0); + g(x) = f(x) + f(x - 1); + h(x) = g(x) + g(x - 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + Buffer im = h.realize({100}); + if (count != 102) { + printf("f was called %d times instead of %d times\n", count, 102); + return -1; + } + } + // Try again where there's a containing stage { count = 0; @@ -201,6 +242,24 @@ int main(int argc, char **argv) { } } + { + // Sliding where we only need a new value every third iteration of the consumer. + // This test checks that we don't ask for excessive bounds. 
+ ImageParam f(Int(32), 1); + Func g; + + g(x) = f(x / 3); + + Var xo; + g.split(x, xo, x, 10); + f.in().store_at(g, xo).compute_at(g, x); + + Buffer buf(33); + f.set(buf); + + Buffer im = g.realize({98}); + } + { // Sliding with an unrolled producer Var x, xi; @@ -221,6 +280,102 @@ int main(int argc, char **argv) { } } + { + // Sliding with a vectorized producer and consumer. + count = 0; + Func f, g; + f(x) = call_counter(x, 0); + g(x) = f(x + 1) + f(x - 1); + + f.store_root().compute_at(g, x).vectorize(x, 4); + g.vectorize(x, 4); + + Buffer im = g.realize({100}); + if (count != 104) { + printf("f was called %d times instead of %d times\n", count, 104); + return -1; + } + } + + { + // A sequence of stencils, all computed at the output. + count = 0; + Func f, g, h, u, v; + f(x, y) = call_counter(x, y); + g(x, y) = f(x, y - 1) + f(x, y + 1); + h(x, y) = g(x - 1, y) + g(x + 1, y); + u(x, y) = h(x, y - 1) + h(x, y + 1); + v(x, y) = u(x - 1, y) + u(x + 1, y); + + u.compute_at(v, y); + h.store_root().compute_at(v, y); + g.store_root().compute_at(v, y); + f.store_root().compute_at(v, y); + + v.realize({10, 10}); + if (count != 14 * 14) { + printf("f was called %d times instead of %d times\n", count, 14 * 14); + return -1; + } + } + + { + // A sequence of stencils, sliding computed at the output. + count = 0; + Func f, g, h, u, v; + f(x, y) = call_counter(x, y); + g(x, y) = f(x, y - 1) + f(x, y + 1); + h(x, y) = g(x - 1, y) + g(x + 1, y); + u(x, y) = h(x, y - 1) + h(x, y + 1); + v(x, y) = u(x - 1, y) + u(x + 1, y); + + u.compute_at(v, y); + h.store_root().compute_at(v, y); + g.compute_at(h, y); + f.store_root().compute_at(v, y); + + v.realize({10, 10}); + if (count != 14 * 14) { + printf("f was called %d times instead of %d times\n", count, 14 * 14); + return -1; + } + } + + { + // Sliding a func that has a boundary condition before the beginning + // of the loop. This needs an explicit warmup before we start sliding. 
+ count = 0; + Func f, g; + f(x) = call_counter(x, 0); + g(x) = f(max(x, 3)); + + f.store_root().compute_at(g, x); + + g.realize({10}); + if (count != 7) { + printf("f was called %d times instead of %d times\n", count, 7); + return -1; + } + } + + { + // Sliding a func that has a boundary condition on both sides. + count = 0; + Func f, g, h; + f(x) = call_counter(x, 0); + g(x) = f(clamp(x, 0, 9)); + h(x) = g(x - 1) + g(x + 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + h.realize({10}); + if (count != 10) { + printf("f was called %d times instead of %d times\n", count, 10); + return -1; + } + } + printf("Success!\n"); return 0; } diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp index 3bbfb672f512..bf412e52d7ab 100644 --- a/test/correctness/storage_folding.cpp +++ b/test/correctness/storage_folding.cpp @@ -385,6 +385,42 @@ int main(int argc, char **argv) { } } + { + custom_malloc_sizes.clear(); + Func f, g, h; + + // Two stages of upsampling is even trickier. + h(x, y) = x * y; + g(x, y) = h(x, y / 2) + h(x, y / 2 + 1); + f(x, y) = g(x, y / 2) + g(x, y / 2 + 1); + + h.compute_at(f, y).store_root().fold_storage(y, 4); + g.compute_at(f, y).store_root().fold_storage(y, 2); + + f.set_custom_allocator(my_malloc, my_free); + + Buffer im = f.realize({1000, 1000}); + + // Halide allocates one extra scalar, so we account for that. 
+ size_t expected_size_g = 1000 * 4 * sizeof(int) + sizeof(int); + size_t expected_size_h = 1000 * 2 * sizeof(int) + sizeof(int); + if (!check_expected_mallocs({expected_size_g, expected_size_h})) { + return -1; + } + + for (int y = 0; y < im.height(); y++) { + for (int x = 0; x < im.width(); x++) { + auto correct_h = [](int x, int y) { return x * y; }; + auto correct_g = [=](int x, int y) { return correct_h(x, y / 2) + correct_h(x, y / 2 + 1); }; + auto correct_f = [=](int x, int y) { return correct_g(x, y / 2) + correct_g(x, y / 2 + 1); }; + if (im(x, y) != correct_f(x, y)) { + printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct_f(x, y)); + return -1; + } + } + } + } + for (bool interleave : {false, true}) { Func f, g; diff --git a/test/correctness/tracing.cpp b/test/correctness/tracing.cpp index 92cce6f2f752..1eeab85513c0 100644 --- a/test/correctness/tracing.cpp +++ b/test/correctness/tracing.cpp @@ -243,9 +243,9 @@ int main(int argc, char **argv) { {103, 11, 1, 2, 32, 4, 1, 4, {1, 2, 3, 4}, {0.995004f, 0.980067f, 0.955337f, 0.921061f}, ""}, {103, 11, 5, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 17, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.099833f, 0.198669f, 0.295520f}, ""}, {103, 17, 0, 2, 32, 4, 1, 4, {1, 2, 3, 4}, {0.995004f, 0.980067f, 0.955337f, 0.921061f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.995004f, 1.079900f, 1.154006f, 1.216581f}, ""}, {103, 17, 7, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 4, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, @@ -253,9 +253,9 @@ int main(int argc, char **argv) { {103, 23, 1, 2, 32, 4, 1, 
4, {5, 6, 7, 8}, {0.877583f, 0.825336f, 0.764842f, 0.696707f}, ""}, {103, 23, 5, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 27, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.389418f, 0.479426f, 0.564642f, 0.644218f}, ""}, {103, 27, 0, 2, 32, 4, 1, 4, {5, 6, 7, 8}, {0.877583f, 0.825336f, 0.764842f, 0.696707f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {1.267001f, 1.304761f, 1.329485f, 1.340924f}, ""}, {103, 27, 7, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 4, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, @@ -263,9 +263,9 @@ int main(int argc, char **argv) { {103, 33, 1, 2, 32, 4, 1, 4, {7, 8, 9, 10}, {0.764842f, 0.696707f, 0.621610f, 0.540302f}, ""}, {103, 33, 5, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 37, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.564642f, 0.644218f, 0.717356f, 0.783327f}, ""}, {103, 37, 0, 2, 32, 4, 1, 4, {7, 8, 9, 10}, {0.764842f, 0.696707f, 0.621610f, 0.540302f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {1.329485f, 1.340924f, 1.338966f, 1.323629f}, ""}, {103, 37, 7, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 5, 3, 0, 0, 0, 2, {0, 10, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""},