diff --git a/apps/blur/halide_blur_generator.cpp b/apps/blur/halide_blur_generator.cpp index 5c208f796fee..d5dfe51f8f15 100644 --- a/apps/blur/halide_blur_generator.cpp +++ b/apps/blur/halide_blur_generator.cpp @@ -82,6 +82,7 @@ class HalideBlur : public Halide::Generator { } } else if (get_target().has_feature(Target::HVX)) { // Hexagon schedule. + // TODO: Try using a schedule like the CPU one below. const int vector_size = 128; blur_y.compute_root() @@ -96,8 +97,17 @@ class HalideBlur : public Halide::Generator { .vectorize(x, vector_size); } else { // CPU schedule. - blur_y.split(y, y, yi, 8).parallel(y).vectorize(x, 8); - blur_x.store_at(blur_y, y).compute_at(blur_y, yi).vectorize(x, 8); + // Compute blur_x as needed at each vector of the output. + // Halide will store blur_x in a circular buffer so its + // results can be re-used. + blur_y + .split(y, y, yi, 32) + .parallel(y) + .vectorize(x, 16); + blur_x + .store_at(blur_y, y) + .compute_at(blur_y, x) + .vectorize(x, 16); } } }; diff --git a/apps/blur/test.cpp b/apps/blur/test.cpp index 88f8d058a4cb..6d7e678285e7 100644 --- a/apps/blur/test.cpp +++ b/apps/blur/test.cpp @@ -159,8 +159,8 @@ int main(int argc, char **argv) { const bool is_hexagon = strstr(md->target, "hvx_128") || strstr(md->target, "hvx_64"); // The Hexagon simulator can't allocate as much memory as the above wants. - const int width = is_hexagon ? 648 : 6408; - const int height = is_hexagon ? 482 : 4802; + const int width = is_hexagon ? 648 : 2568; + const int height = is_hexagon ? 
482 : 1922; Buffer input(width, height); diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index 9b68c7cdb109..9c8005724555 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -530,7 +530,7 @@ void CameraPipe::generate() { .compute_at(processed, yi) .store_at(processed, yo) .prefetch(input, y, 2) - .fold_storage(y, 16) + .fold_storage(y, 4) .tile(x, y, x, y, xi, yi, 2 * vec, 2) .vectorize(xi) .unroll(yi); @@ -538,7 +538,7 @@ void CameraPipe::generate() { deinterleaved .compute_at(processed, yi) .store_at(processed, yo) - .fold_storage(y, 8) + .fold_storage(y, 4) .reorder(c, x, y) .vectorize(x, 2 * vec, TailStrategy::RoundUp) .unroll(c); diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index 77d32b8ac4a8..4a27e3dd454a 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -148,7 +148,7 @@ class LocalLaplacian : public Halide::Generator { outGPyramid[j] .store_at(output, yo) .compute_at(output, y) - .fold_storage(y, 8) + .fold_storage(y, 4) .vectorize(x, 8); } outGPyramid[0].compute_at(output, y).vectorize(x, 8); diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index 6b1798b25528..7fa67ac2192f 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -349,8 +349,12 @@ class ExtractSharedAndHeapAllocations : public IRMutator { // repeated dependence on the block var s.size = solve_expression(s.size, op->name).result; s.size = simplify(common_subexpression_elimination(s.size)); - auto result = is_monotonic(s.size, op->name); - if (result == Monotonic::Unknown) { + switch (is_monotonic(s.size, op->name)) { + case Monotonic::Unknown: + // TODO: if bounds_of_expr_in_scope becomes more + // powerful than is_monotonic, it might be better + // to call it here. 
That would be risky though, as + // it's not exact. debug(1) << "Shared allocation for " << s.name << " has a size that is non-monontonic in the gpu block variable " << op->name @@ -359,19 +363,19 @@ class ExtractSharedAndHeapAllocations : public IRMutator { get_compiler_logger()->record_non_monotonic_loop_var(op->name, s.size); } precompute_allocation_size(s); - } else { - auto interval_bounds = bounds_of_expr_in_scope(s.size, scope); - user_assert(interval_bounds.has_upper_bound()) - << "Couldn't infer bounds for " << s.name << " shared memory allocation\n"; - // In theory we could precompute the allocation - // size if there's no upper bound too, but for the - // assert above to fail we'd have to encounter an - // expression that is_monotonic detects as - // increasing, decreasing, or constant, but is - // somehow unbounded. It's probable that no such - // expression exists. is_monotonic is generally - // less capable than bounds_of_expr_in_scope. - s.size = interval_bounds.max; + break; + case Monotonic::Increasing: + s.size = substitute(op->name, simplify(op->min + op->extent - 1), s.size); + break; + case Monotonic::Constant: + // The size expression used the variable, but we + // may have successfully eliminated it above, or + // is_monotonic might have detected that the + // dependence is false somehow. Just treat it as + // decreasing... 
+ case Monotonic::Decreasing: + s.size = substitute(op->name, op->min, s.size); + break; } } if (in_threads && op->is_parallel()) { diff --git a/src/Interval.cpp b/src/Interval.cpp index 6c9ef0d48843..10550f7ed48b 100644 --- a/src/Interval.cpp +++ b/src/Interval.cpp @@ -157,5 +157,91 @@ Expr Interval::neg_inf_noinline() { return Interval::neg_inf_expr; } +ConstantInterval::ConstantInterval() = default; + +ConstantInterval::ConstantInterval(int64_t min, int64_t max) + : min(min), max(max), min_defined(true), max_defined(true) { + internal_assert(min <= max); +} + +ConstantInterval ConstantInterval::everything() { + return ConstantInterval(); +} + +ConstantInterval ConstantInterval::single_point(int64_t x) { + return ConstantInterval(x, x); +} + +ConstantInterval ConstantInterval::bounded_below(int64_t min) { + ConstantInterval result(min, min); + result.max_defined = false; + return result; +} + +ConstantInterval ConstantInterval::bounded_above(int64_t max) { + ConstantInterval result(max, max); + result.min_defined = false; + return result; +} + +bool ConstantInterval::is_everything() const { + return !min_defined && !max_defined; +} + +bool ConstantInterval::is_single_point() const { + return min_defined && max_defined && min == max; +} + +bool ConstantInterval::is_single_point(int64_t x) const { + return min_defined && max_defined && min == x && max == x; +} + +bool ConstantInterval::has_upper_bound() const { + return max_defined; +} + +bool ConstantInterval::has_lower_bound() const { + return min_defined; +} + +bool ConstantInterval::is_bounded() const { + return has_upper_bound() && has_lower_bound(); +} + +bool ConstantInterval::operator==(const ConstantInterval &other) const { + if (min_defined != other.min_defined || max_defined != other.max_defined) { + return false; + } + return (!min_defined || min == other.min) && (!max_defined || max == other.max); +} + +void ConstantInterval::include(const ConstantInterval &i) { + if (max_defined && i.max_defined) { + 
max = std::max(max, i.max); + } else { + max_defined = false; + } + if (min_defined && i.min_defined) { + min = std::min(min, i.min); + } else { + min_defined = false; + } +} + +void ConstantInterval::include(int64_t x) { + if (max_defined) { + max = std::max(max, x); + } + if (min_defined) { + min = std::min(min, x); + } +} + +ConstantInterval ConstantInterval::make_union(const ConstantInterval &a, const ConstantInterval &b) { + ConstantInterval result = a; + result.include(b); + return result; +} + } // namespace Internal } // namespace Halide diff --git a/src/Interval.h b/src/Interval.h index 2c7c40c49712..1d90d4a29b55 100644 --- a/src/Interval.h +++ b/src/Interval.h @@ -110,6 +110,63 @@ struct Interval { static Expr neg_inf_noinline(); }; +/** A class to represent ranges of integers. Can be unbounded above or below, but + * they cannot be empty. */ +struct ConstantInterval { + /** The lower and upper bound of the interval. They are included + * in the interval. */ + int64_t min = 0, max = 0; + bool min_defined = false, max_defined = false; + + /* A default-constructed Interval is everything */ + ConstantInterval(); + + /** Construct an interval from a lower and upper bound. */ + ConstantInterval(int64_t min, int64_t max); + + /** The interval representing everything. */ + static ConstantInterval everything(); + + /** Construct an interval representing a single point. */ + static ConstantInterval single_point(int64_t x); + + /** Construct intervals bounded above or below. 
*/ + static ConstantInterval bounded_below(int64_t min); + static ConstantInterval bounded_above(int64_t max); + + /** Is the interval the entire range */ + bool is_everything() const; + + /** Is the interval just a single value (min == max) */ + bool is_single_point() const; + + /** Is the interval a particular single value */ + bool is_single_point(int64_t x) const; + + /** Does the interval have a finite least upper bound */ + bool has_upper_bound() const; + + /** Does the interval have a finite greatest lower bound */ + bool has_lower_bound() const; + + /** Does the interval have a finite upper and lower bound */ + bool is_bounded() const; + + /** Expand the interval to include another Interval */ + void include(const ConstantInterval &i); + + /** Expand the interval to include a point */ + void include(int64_t x); + + /** Construct the smallest interval containing two intervals. */ + static ConstantInterval make_union(const ConstantInterval &a, const ConstantInterval &b); + + /** Equivalent to same_as. Exists so that the autoscheduler can + * compare two map for equality in order to + * cache computations. 
*/ + bool operator==(const ConstantInterval &other) const; +}; + } // namespace Internal } // namespace Halide diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index fd8285608770..7383ed377dff 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -1,4 +1,5 @@ #include "Monotonic.h" +#include "Bounds.h" #include "IROperator.h" #include "IRVisitor.h" #include "Scope.h" @@ -30,26 +31,212 @@ using std::string; namespace { -class MonotonicVisitor : public IRVisitor { +const int64_t *as_const_int_or_uint(const Expr &e) { + if (const int64_t *i = as_const_int(e)) { + return i; + } else if (const uint64_t *u = as_const_uint(e)) { + if (*u <= (uint64_t)std::numeric_limits::max()) { + return (const int64_t *)u; + } + } + return nullptr; +} + +bool is_constant(const ConstantInterval &a) { + return a.is_single_point(0); +} + +bool may_be_negative(const ConstantInterval &a) { + return !a.has_lower_bound() || a.min < 0; +} + +bool may_be_positive(const ConstantInterval &a) { + return !a.has_upper_bound() || a.max > 0; +} + +bool is_monotonic_increasing(const ConstantInterval &a) { + return !may_be_negative(a); +} + +bool is_monotonic_decreasing(const ConstantInterval &a) { + return !may_be_positive(a); +} + +ConstantInterval to_interval(Monotonic m) { + switch (m) { + case Monotonic::Constant: + return ConstantInterval::single_point(0); + case Monotonic::Increasing: + return ConstantInterval::bounded_below(0); + case Monotonic::Decreasing: + return ConstantInterval::bounded_above(0); + case Monotonic::Unknown: + return ConstantInterval::everything(); + } + return ConstantInterval::everything(); +} + +Monotonic to_monotonic(const ConstantInterval &x) { + if (is_constant(x)) { + return Monotonic::Constant; + } else if (is_monotonic_increasing(x)) { + return Monotonic::Increasing; + } else if (is_monotonic_decreasing(x)) { + return Monotonic::Decreasing; + } else { + return Monotonic::Unknown; + } +} + +ConstantInterval unify(const ConstantInterval &a, const ConstantInterval 
&b) { + return ConstantInterval::make_union(a, b); +} + +ConstantInterval unify(const ConstantInterval &a, int64_t b) { + ConstantInterval result; + result.include(b); + return result; +} + +// Helpers for doing arithmetic on ConstantIntervals that avoid generating +// expressions of pos_inf/neg_inf. +ConstantInterval add(const ConstantInterval &a, const ConstantInterval &b) { + ConstantInterval result; + result.min_defined = a.has_lower_bound() && b.has_lower_bound(); + result.max_defined = a.has_upper_bound() && b.has_upper_bound(); + if (result.has_lower_bound()) { + result.min = a.min + b.min; + } + if (result.has_upper_bound()) { + result.max = a.max + b.max; + } + return result; +} + +ConstantInterval add(const ConstantInterval &a, int64_t b) { + return add(a, ConstantInterval(b, b)); +} + +ConstantInterval negate(const ConstantInterval &r) { + ConstantInterval result; + result.min_defined = r.has_upper_bound(); + result.min = r.has_upper_bound() ? -r.max : 0; + result.max_defined = r.has_lower_bound(); + result.max = r.has_lower_bound() ? 
-r.min : 0; + return result; +} + +ConstantInterval sub(const ConstantInterval &a, const ConstantInterval &b) { + return add(a, negate(b)); +} + +ConstantInterval sub(const ConstantInterval &a, int64_t b) { + return sub(a, ConstantInterval(b, b)); +} + +ConstantInterval multiply(const ConstantInterval &a, int64_t b) { + ConstantInterval result(a); + if (b < 0) { + result = negate(result); + b = -b; + } + if (result.has_lower_bound()) { + result.min *= b; + } + if (result.has_upper_bound()) { + result.max *= b; + } + return result; +} + +ConstantInterval multiply(const ConstantInterval &a, const Expr &b) { + if (const int64_t *bi = as_const_int_or_uint(b)) { + return multiply(a, *bi); + } + return ConstantInterval::everything(); +} + +ConstantInterval multiply(const ConstantInterval &a, const ConstantInterval &b) { + int64_t bounds[4]; + int64_t *bounds_begin = &bounds[0]; + int64_t *bounds_end = &bounds[0]; + if (a.has_lower_bound() && b.has_lower_bound()) { + *bounds_end++ = a.min * b.min; + } + if (a.has_lower_bound() && b.has_upper_bound()) { + *bounds_end++ = a.min * b.max; + } + if (a.has_upper_bound() && b.has_lower_bound()) { + *bounds_end++ = a.max * b.min; + } + if (a.has_upper_bound() && b.has_upper_bound()) { + *bounds_end++ = a.max * b.max; + } + if (bounds_begin != bounds_end) { + ConstantInterval result = { + *std::min_element(bounds_begin, bounds_end), + *std::max_element(bounds_begin, bounds_end), + }; + // There *must* be a better way than this... Even + // cutting half the cases with swapping isn't that much help. 
+ if (!a.has_lower_bound()) { + if (may_be_negative(b)) result.max_defined = false; // NOLINT + if (may_be_positive(b)) result.min_defined = false; // NOLINT + } + if (!a.has_upper_bound()) { + if (may_be_negative(b)) result.min_defined = false; // NOLINT + if (may_be_positive(b)) result.max_defined = false; // NOLINT + } + if (!b.has_lower_bound()) { + if (may_be_negative(a)) result.max_defined = false; // NOLINT + if (may_be_positive(a)) result.min_defined = false; // NOLINT + } + if (!b.has_upper_bound()) { + if (may_be_negative(a)) result.min_defined = false; // NOLINT + if (may_be_positive(a)) result.max_defined = false; // NOLINT + } + return result; + } else { + return ConstantInterval::everything(); + } +} + +ConstantInterval divide(const ConstantInterval &a, int64_t b) { + ConstantInterval result(a); + if (b < 0) { + result = negate(result); + b = -b; + } + if (result.has_lower_bound()) { + result.min = div_imp(result.min, b); + } + if (result.has_upper_bound()) { + result.max = div_imp(result.max + b - 1, b); + } + return result; +} + +class DerivativeBounds : public IRVisitor { const string &var; - Scope scope; + Scope scope; + Scope bounds; void visit(const IntImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const UIntImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const FloatImm *) override { - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const StringImm *) override { // require() Exprs can includes Strings. - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const Cast *op) override { @@ -67,135 +254,105 @@ class MonotonicVisitor : public IRVisitor { // A narrowing cast. There may be more cases we can catch, but // for now we punt. 
- if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } void visit(const Variable *op) override { if (op->name == var) { - result = Monotonic::Increasing; + result = ConstantInterval::single_point(1); } else if (scope.contains(op->name)) { result = scope.get(op->name); } else { - result = Monotonic::Constant; - } - } - - Monotonic flip(Monotonic r) { - switch (r) { - case Monotonic::Increasing: - return Monotonic::Decreasing; - case Monotonic::Decreasing: - return Monotonic::Increasing; - default: - return r; - } - } - - Monotonic unify(Monotonic a, Monotonic b) { - if (a == b) { - return a; - } - - if (a == Monotonic::Unknown || b == Monotonic::Unknown) { - return Monotonic::Unknown; - } - - if (a == Monotonic::Constant) { - return b; + result = ConstantInterval::single_point(0); } - - if (b == Monotonic::Constant) { - return a; - } - - return Monotonic::Unknown; } void visit(const Add *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; - result = unify(ra, rb); + ConstantInterval rb = result; + result = add(ra, rb); } void visit(const Sub *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; - result = unify(ra, flip(rb)); + ConstantInterval rb = result; + result = sub(ra, rb); } void visit(const Mul *op) override { - op->a.accept(this); - Monotonic ra = result; - op->b.accept(this); - Monotonic rb = result; - - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; - } else if (is_positive_const(op->a)) { - result = rb; - } else if (is_positive_const(op->b)) { - result = ra; - } else if (is_negative_const(op->a)) { - result = flip(rb); - } else if (is_negative_const(op->b)) { - result = flip(ra); + if (op->type.is_scalar()) { + op->a.accept(this); + ConstantInterval 
ra = result; + op->b.accept(this); + ConstantInterval rb = result; + + // This is essentially the product rule: a*rb + b*ra + // but only implemented for the case where a or b is constant. + if (const int64_t *b = as_const_int_or_uint(op->b)) { + result = multiply(ra, *b); + } else if (const int64_t *a = as_const_int_or_uint(op->a)) { + result = multiply(rb, *a); + } else { + result = ConstantInterval::everything(); + } } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Div *op) override { - op->a.accept(this); - Monotonic ra = result; - op->b.accept(this); - Monotonic rb = result; - - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; - } else if (is_positive_const(op->b)) { - result = ra; - } else if (is_negative_const(op->b)) { - result = flip(ra); + if (op->type.is_scalar()) { + op->a.accept(this); + ConstantInterval ra = result; + + if (const int64_t *b = as_const_int_or_uint(op->b)) { + result = divide(ra, *b); + } else { + result = ConstantInterval::everything(); + } } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Mod *op) override { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } void visit(const Min *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Max *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit_eq(const Expr &a, const Expr &b) { a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; b.accept(this); - Monotonic rb = result; - if (ra == Monotonic::Constant && rb == Monotonic::Constant) { - result = Monotonic::Constant; + ConstantInterval rb = 
result; + if (is_constant(ra) && is_constant(rb)) { + result = ConstantInterval::single_point(0); } else { - result = Monotonic::Unknown; + // If the result is bounded, limit it to [-1, 1]. The largest + // difference possible is flipping from true to false or false + // to true. + result = ConstantInterval(-1, 1); } } @@ -209,10 +366,19 @@ class MonotonicVisitor : public IRVisitor { void visit_lt(const Expr &a, const Expr &b) { a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; b.accept(this); - Monotonic rb = result; - result = unify(flip(ra), rb); + ConstantInterval rb = result; + result = unify(negate(ra), rb); + // If the result is bounded, limit it to [-1, 1]. The largest + // difference possible is flipping from true to false or false + // to true. + if (result.has_lower_bound()) { + result.min = std::min(std::max(result.min, -1), 1); + } + if (result.has_upper_bound()) { + result.max = std::min(std::max(result.max, -1), 1); + } } void visit(const LT *op) override { @@ -233,71 +399,63 @@ class MonotonicVisitor : public IRVisitor { void visit(const And *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Or *op) override { op->a.accept(this); - Monotonic ra = result; + ConstantInterval ra = result; op->b.accept(this); - Monotonic rb = result; + ConstantInterval rb = result; result = unify(ra, rb); } void visit(const Not *op) override { op->a.accept(this); - result = flip(result); + result = negate(result); } void visit(const Select *op) override { - op->condition.accept(this); - Monotonic rcond = result; - - op->true_value.accept(this); - Monotonic ra = result; - op->false_value.accept(this); - Monotonic rb = result; - Monotonic unified = unify(ra, rb); - - if (rcond == Monotonic::Constant) { - result = unified; - return; - } + // The result is the unified bounds, added to the 
"bump" that happens when switching from true to false. + if (op->type.is_scalar()) { + op->condition.accept(this); + ConstantInterval rcond = result; + + op->true_value.accept(this); + ConstantInterval ra = result; + op->false_value.accept(this); + ConstantInterval rb = result; + ConstantInterval unified = unify(ra, rb); + + // TODO: How to handle unsigned values? + Expr delta = simplify(op->true_value - op->false_value); + + Interval delta_bounds = find_constant_bounds(delta, bounds); + ConstantInterval adjusted_delta; + // TODO: Maybe we can do something with one-sided intervals? + if (delta_bounds.is_bounded()) { + ConstantInterval delta_low = multiply(rcond, delta_bounds.min); + ConstantInterval delta_high = multiply(rcond, delta_bounds.max); + adjusted_delta = ConstantInterval::make_union(delta_low, delta_high); + } else { + delta.accept(this); + ConstantInterval rdelta = result; + adjusted_delta = multiply(rcond, rdelta); + } - bool true_value_ge_false_value = can_prove(op->true_value >= op->false_value); - bool true_value_le_false_value = can_prove(op->true_value <= op->false_value); - - bool switches_from_true_to_false = rcond == Monotonic::Decreasing; - bool switches_from_false_to_true = rcond == Monotonic::Increasing; - - if (true_value_ge_false_value && - true_value_le_false_value) { - // The true value equals the false value. - result = ra; - } else if ((unified == Monotonic::Increasing || unified == Monotonic::Constant) && - ((switches_from_false_to_true && true_value_ge_false_value) || - (switches_from_true_to_false && true_value_le_false_value))) { - // Both paths increase, and the condition makes it switch - // from the lesser path to the greater path. 
- result = Monotonic::Increasing; - } else if ((unified == Monotonic::Decreasing || unified == Monotonic::Constant) && - ((switches_from_false_to_true && true_value_le_false_value) || - (switches_from_true_to_false && true_value_ge_false_value))) { - // Both paths decrease, and the condition makes it switch - // from the greater path to the lesser path. - result = Monotonic::Decreasing; + result = add(unified, adjusted_delta); } else { - result = Monotonic::Unknown; + result = ConstantInterval::everything(); } } void visit(const Load *op) override { op->index.accept(this); - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } @@ -331,52 +489,55 @@ class MonotonicVisitor : public IRVisitor { return; } - if (!op->is_pure()) { + if (!op->is_pure() || !is_constant(result)) { // Even with constant args, the result could vary from one loop iteration to the next. - result = Monotonic::Unknown; + result = ConstantInterval::everything(); return; } for (size_t i = 0; i < op->args.size(); i++) { op->args[i].accept(this); - if (result != Monotonic::Constant) { + if (!is_constant(result)) { // One of the args is not constant. - result = Monotonic::Unknown; + result = ConstantInterval::everything(); return; } } - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const Let *op) override { op->value.accept(this); - if (result == Monotonic::Constant) { + ScopedBinding bounds_binding(bounds, op->name, find_constant_bounds(op->value, bounds)); + + if (is_constant(result)) { // No point pushing it if it's constant w.r.t the var, // because unknown variables are treated as constant. 
op->body.accept(this); } else { - scope.push(op->name, result); + ScopedBinding scope_binding(scope, op->name, result); op->body.accept(this); - scope.pop(op->name); } } void visit(const Shuffle *op) override { for (size_t i = 0; i < op->vectors.size(); i++) { op->vectors[i].accept(this); - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); return; } } - result = Monotonic::Constant; + result = ConstantInterval::single_point(0); } void visit(const VectorReduce *op) override { op->value.accept(this); switch (op->op) { case VectorReduce::Add: + result = multiply(result, op->value.type().lanes() / op->type.lanes()); + break; case VectorReduce::Min: case VectorReduce::Max: // These reductions are monotonic in the arg @@ -385,8 +546,8 @@ class MonotonicVisitor : public IRVisitor { case VectorReduce::And: case VectorReduce::Or: // These ones are not - if (result != Monotonic::Constant) { - result = Monotonic::Unknown; + if (!is_constant(result)) { + result = ConstantInterval::everything(); } } } @@ -456,25 +617,43 @@ class MonotonicVisitor : public IRVisitor { } public: - Monotonic result; + ConstantInterval result; - MonotonicVisitor(const std::string &v, const Scope &parent) - : var(v), result(Monotonic::Unknown) { + DerivativeBounds(const std::string &v, const Scope &parent) + : var(v), result(ConstantInterval::everything()) { scope.set_containing_scope(&parent); } }; } // namespace -Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { +ConstantInterval derivative_bounds(const Expr &e, const std::string &var, const Scope &scope) { if (!e.defined()) { - return Monotonic::Unknown; + return ConstantInterval::everything(); } - MonotonicVisitor m(var, scope); + DerivativeBounds m(var, scope); e.accept(&m); return m.result; } +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { + if (!e.defined()) { + return 
Monotonic::Unknown; + } + return to_monotonic(derivative_bounds(e, var, scope)); +} + +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope) { + if (!e.defined()) { + return Monotonic::Unknown; + } + Scope intervals_scope; + for (Scope::const_iterator i = scope.cbegin(); i != scope.cend(); ++i) { + intervals_scope.push(i.name(), to_interval(i.value())); + } + return is_monotonic(e, var, intervals_scope); +} + namespace { void check_increasing(const Expr &e) { internal_assert(is_monotonic(e, "x") == Monotonic::Increasing) @@ -506,6 +685,7 @@ void is_monotonic_test() { check_increasing(x + 4); check_increasing(x + y); check_increasing(x * 4); + check_increasing(x / 4); check_increasing(min(x + 4, y + 4)); check_increasing(max(x + y, x - y)); check_increasing(x >= y); @@ -513,12 +693,17 @@ void is_monotonic_test() { check_decreasing(-x); check_decreasing(x * -4); + check_decreasing(x / -4); check_decreasing(y - x); check_decreasing(x < y); check_decreasing(x <= y); check_unknown(x == y); check_unknown(x != y); + check_increasing(y <= x); + check_increasing(y < x); + check_decreasing(x <= y); + check_decreasing(x < y); check_unknown(x * y); // Not constant despite having constant args, because there's a side-effect. 
@@ -527,10 +712,14 @@ void is_monotonic_test() { check_increasing(select(y == 2, x, x + 4)); check_decreasing(select(y == 2, -x, x * -4)); - check_increasing(select(x > 2, x + 1, x)); - check_increasing(select(x < 2, x, x + 1)); - check_decreasing(select(x > 2, -x - 1, -x)); - check_decreasing(select(x < 2, -x, -x - 1)); + check_unknown(select(x > 2, x - 2, x)); + check_unknown(select(x < 2, x, x - 2)); + check_unknown(select(x > 2, -x + 2, -x)); + check_unknown(select(x < 2, -x, -x + 2)); + check_increasing(select(x > 2, x - 1, x)); + check_increasing(select(x < 2, x, x - 1)); + check_decreasing(select(x > 2, -x + 1, -x)); + check_decreasing(select(x < 2, -x, -x + 1)); check_unknown(select(x < 2, x, x - 5)); check_unknown(select(x > 2, x - 5, x)); @@ -546,6 +735,12 @@ void is_monotonic_test() { check_constant(select(y > 3, y + 23, y - 65)); + check_decreasing(select(2 <= x, 0, 1)); + check_increasing(select(2 <= x, 0, 1) + x); + check_decreasing(-min(x, 16)); + + check_unknown(select(0 < x, max(min(x, 4), 3), 4)); + std::cout << "is_monotonic test passed" << std::endl; } diff --git a/src/Monotonic.h b/src/Monotonic.h index c06fe8eac289..3d7946a13ed7 100644 --- a/src/Monotonic.h +++ b/src/Monotonic.h @@ -8,12 +8,16 @@ #include #include -#include "Expr.h" +#include "Interval.h" #include "Scope.h" namespace Halide { namespace Internal { +/** Find the bounds of the derivative of an expression. */ +ConstantInterval derivative_bounds(const Expr &e, const std::string &var, + const Scope &scope = Scope::empty_scope()); + /** * Detect whether an expression is monotonic increasing in a variable, * decreasing, or unknown. 
@@ -23,7 +27,8 @@ enum class Monotonic { Constant, Decreasing, Unknown }; Monotonic is_monotonic(const Expr &e, const std::string &var, - const Scope &scope = Scope::empty_scope()); + const Scope &scope = Scope::empty_scope()); +Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope &scope); /** Emit the monotonic class in human-readable form for debugging. */ std::ostream &operator<<(std::ostream &stream, const Monotonic &m); diff --git a/src/SimplifyCorrelatedDifferences.cpp b/src/SimplifyCorrelatedDifferences.cpp index 0c78cfd2ad4a..cf7b5475e3d3 100644 --- a/src/SimplifyCorrelatedDifferences.cpp +++ b/src/SimplifyCorrelatedDifferences.cpp @@ -24,7 +24,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { string loop_var; - Scope monotonic; + Scope monotonic; struct OuterLet { string name; @@ -38,11 +38,11 @@ class SimplifyCorrelatedDifferences : public IRMutator { // Visit an entire chain of lets in a single method to conserve stack space. struct Frame { const LetStmtOrLet *op; - ScopedBinding binding; + ScopedBinding binding; Expr new_value; - Frame(const LetStmtOrLet *op, const string &loop_var, Scope &scope) + Frame(const LetStmtOrLet *op, const string &loop_var, Scope &scope) : op(op), - binding(scope, op->name, is_monotonic(op->value, loop_var, scope)) { + binding(scope, op->name, derivative_bounds(op->value, loop_var, scope)) { } Frame(const LetStmtOrLet *op) : op(op) { @@ -59,7 +59,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { // same name. If we decide not to add an inner let, but do add // the outer one, then later references to it will be // incorrect. Second, if we don't add something that happens - // to be non-monotonic, then is_monotonic finds a variable + // to be non-monotonic, then derivative_bounds finds a variable // that references it in a later let, it will think it's a // constant, not an unknown. 
do { @@ -118,7 +118,7 @@ class SimplifyCorrelatedDifferences : public IRMutator { tmp_lets.swap(lets); loop_var = op->name; { - ScopedBinding bind(monotonic, loop_var, Monotonic::Increasing); + ScopedBinding bind(monotonic, loop_var, ConstantInterval::single_point(1)); s = IRMutator::visit(op); } loop_var.clear(); diff --git a/src/Simplify_Internal.h b/src/Simplify_Internal.h index 845aaa07527d..f18e29645ac7 100644 --- a/src/Simplify_Internal.h +++ b/src/Simplify_Internal.h @@ -36,6 +36,7 @@ class Simplify : public VariadicVisitor { struct ExprInfo { // We track constant integer bounds when they exist + // TODO: Use ConstantInterval? int64_t min = 0, max = 0; bool min_defined = false, max_defined = false; // And the alignment of integer variables diff --git a/src/SlidingWindow.cpp b/src/SlidingWindow.cpp index e55848db5783..9e1b7114eedb 100644 --- a/src/SlidingWindow.cpp +++ b/src/SlidingWindow.cpp @@ -3,20 +3,30 @@ #include "Bounds.h" #include "CompilerLogger.h" #include "Debug.h" +#include "ExprUsesVar.h" +#include "IREquality.h" +#include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" #include "Monotonic.h" #include "Scope.h" #include "Simplify.h" +#include "Solve.h" #include "Substitute.h" +#include +#include #include namespace Halide { namespace Internal { +using std::list; using std::map; +using std::pair; +using std::set; using std::string; +using std::vector; namespace { @@ -61,7 +71,7 @@ class ExpandExpr : public IRMutator { Expr visit(const Variable *var) override { if (scope.contains(var->name)) { Expr expr = scope.get(var->name); - debug(3) << "Fully expanded " << var->name << " -> " << expr << "\n"; + debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; return expr; } else { return var; @@ -78,16 +88,44 @@ class ExpandExpr : public IRMutator { Expr expand_expr(const Expr &e, const Scope &scope) { ExpandExpr ee(scope); Expr result = ee.mutate(e); - debug(3) << "Expanded " << e << " into " << result << 
"\n"; + debug(4) << "Expanded " << e << " into " << result << "\n"; return result; } +class FindProduce : public IRVisitor { + const string &func; + + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (op->is_producer && op->name == func) { + found = true; + } else { + IRVisitor::visit(op); + } + } + +public: + bool found = false; + + FindProduce(const string &func) + : func(func) { + } +}; + +bool find_produce(const Stmt &s, const string &func) { + FindProduce finder(func); + s.accept(&finder); + return finder.found; +} + // Perform sliding window optimization for a function over a // particular serial for loop class SlidingWindowOnFunctionAndLoop : public IRMutator { Function func; string loop_var; Expr loop_min; + set &slid_dimensions; Scope scope; map replacements; @@ -112,10 +150,10 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { } Stmt visit(const ProducerConsumer *op) override { - if (!op->is_producer || (op->name != func.name())) { - return IRMutator::visit(op); - } else { - Stmt stmt = op; + if (op->is_producer) { + if (op->name != func.name()) { + return IRMutator::visit(op); + } // We're interested in the case where exactly one of the // dimensions of the buffer has a min/extent that depends @@ -131,6 +169,10 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { string prefix = func.name() + ".s" + std::to_string(func.updates().size()) + "."; const std::vector func_args = func.args(); for (int i = 0; i < func.dimensions(); i++) { + if (slid_dimensions.count(i)) { + debug(3) << "Already slid over dimension " << i << ", so skipping it.\n"; + continue; + } // Look up the region required of this function's last stage string var = prefix + func_args[i]; internal_assert(scope.contains(var + ".min") && scope.contains(var + ".max")); @@ -169,7 +211,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { debug(3) << "Could not perform sliding window optimization of " << func.name() << " over " << 
loop_var << " because multiple " << "dimensions of the function dependended on the loop var\n"; - return stmt; + return op; } // If the function is not pure in the given dimension, give up. We also @@ -185,7 +227,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { debug(3) << "Could not performance sliding window optimization of " << func.name() << " over " << loop_var << " because the function " << "scatters along the related axis.\n"; - return stmt; + return op; } bool can_slide_up = false; @@ -219,7 +261,7 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { << " because I couldn't prove it moved monotonically along that dimension\n" << "Min is " << min_required << "\n" << "Max is " << max_required << "\n"; - return stmt; + return op; } // Ok, we've isolated a function, a dimension to slide @@ -242,26 +284,96 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { << " there's no overlap in the region computed across iterations\n" << "Min is " << min_required << "\n" << "Max is " << max_required << "\n"; - return stmt; + return op; } + // Update the bounds of this producer assuming the previous iteration + // has run already. Expr new_min, new_max; if (can_slide_up) { - new_min = select(loop_var_expr <= loop_min, min_required, likely_if_innermost(prev_max_plus_one)); + new_min = prev_max_plus_one; new_max = max_required; } else { new_min = min_required; - new_max = select(loop_var_expr <= loop_min, max_required, likely_if_innermost(prev_min_minus_one)); + new_max = prev_min_minus_one; } - Expr early_stages_min_required = new_min; - Expr early_stages_max_required = new_max; + // See if we can find a new min for the loop that can warm up the + // sliding window. We're going to do this by building an equation + // that describes the constraints we have on our new loop min. The + // first constraint is that the new loop min is not after the + // loop min. 
+ string new_loop_min_name = unique_name('x'); + Expr new_loop_min_var = Variable::make(Int(32), new_loop_min_name); + Expr new_loop_min_eq = new_loop_min_var <= loop_min; + Expr new_min_at_new_loop_min = substitute(loop_var, new_loop_min_var, new_min); + Expr new_max_at_new_loop_min = substitute(loop_var, new_loop_min_var, new_max); + if (can_slide_up) { + // We need to find a new loop min that satisfies these constraints: + // - The new min at the new loop min needs to be before the min + // required at the original min. + // - The new max needs to be greater than the new min, both at the + // new loop min. This guarantees that the sliding window is not empty. + // Together, these conditions guarantee the sliding window is warmed + // up. The first condition checks that we reached the original loop + // min, and the second condition checks that the iterations before + // the original min weren't empty. + Expr min_required_at_loop_min = substitute(loop_var, loop_min, min_required); + new_loop_min_eq = new_loop_min_eq && + new_min_at_new_loop_min <= min_required_at_loop_min && + new_max_at_new_loop_min >= new_min_at_new_loop_min; + } else { + // When sliding down, the constraints are similar, just swapping + // the roles of the min and max. + Expr max_required_at_loop_min = substitute(loop_var, loop_min, max_required); + new_loop_min_eq = new_loop_min_eq && + new_max_at_new_loop_min >= max_required_at_loop_min && + new_min_at_new_loop_min <= new_max_at_new_loop_min; + } + // Try to solve the equation. + new_loop_min_eq = simplify(new_loop_min_eq); + Interval solve_result = solve_for_inner_interval(new_loop_min_eq, new_loop_min_name); + internal_assert(!new_loop_min.defined()); + if (solve_result.has_upper_bound() && !equal(solve_result.max, loop_min)) { + new_loop_min = simplify(solve_result.max); + + // We have a new loop min, so we can assume every iteration has
In order for this to be safe, we need + // the new min/max at the new loop min to be less than or equal to + // the min/max required at the original loop min. + Expr loop_var_expr = Variable::make(Int(32), loop_var); + Expr orig_loop_min_expr = Variable::make(Int(32), loop_var + ".loop_min.orig"); + if (can_slide_up) { + Expr min_required_at_loop_min = substitute(loop_var, orig_loop_min_expr, min_required); + new_min = max(new_min, min_required_at_loop_min); + } else { + Expr max_required_at_loop_min = substitute(loop_var, orig_loop_min_expr, max_required); + new_max = min(new_max, max_required_at_loop_min); + } + } else { + // We couldn't find a suitable new loop min, we can't assume + // every iteration has a previous iteration. The first iteration + // will warm up the loop. + Expr need_explicit_warmup = loop_var_expr <= loop_min; + if (can_slide_up) { + new_min = select(need_explicit_warmup, min_required, likely_if_innermost(new_min)); + } else { + new_max = select(need_explicit_warmup, max_required, likely_if_innermost(new_max)); + } + } + new_min = simplify(new_min); + new_max = simplify(new_max); debug(3) << "Sliding " << func.name() << ", " << dim << "\n" << "Pushing min up from " << min_required << " to " << new_min << "\n" - << "Shrinking max from " << max_required << " to " << new_max << "\n"; + << "Shrinking max from " << max_required << " to " << new_max << "\n" + << "Adjusting loop_min from " << loop_min << " to " << new_loop_min << "\n" + << "Equation is " << new_loop_min_eq << "\n"; + + slid_dimensions.insert(dim_idx); // Now redefine the appropriate regions required + internal_assert(replacements.empty()); if (can_slide_up) { replacements[prefix + dim + ".min"] = new_min; } else { @@ -280,19 +392,55 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { // the last stage to cover values produced by stages // before the last one. Because, e.g., an intermediate // stage may be unrolled, expanding its bounds provided. 
+ Stmt result = op; if (!func.updates().empty()) { Box b = box_provided(op->body, func.name()); if (can_slide_up) { string n = prefix + dim + ".min"; Expr var = Variable::make(Int(32), n); - stmt = LetStmt::make(n, min(var, b[dim_idx].min), stmt); + result = LetStmt::make(n, min(var, b[dim_idx].min), result); } else { string n = prefix + dim + ".max"; Expr var = Variable::make(Int(32), n); - stmt = LetStmt::make(n, max(var, b[dim_idx].max), stmt); + result = LetStmt::make(n, max(var, b[dim_idx].max), result); + } + } + + return result; + } else if (!find_produce(op, func.name()) && new_loop_min.defined()) { + // The producer might have expanded the loop before the min to warm + // up the window. This consumer doesn't contain a producer that might + // be part of the warmup, so guard it with an if to only run it on + // the original loop bounds. + Expr loop_var_expr = Variable::make(Int(32), loop_var); + Expr orig_loop_min_expr = Variable::make(Int(32), loop_var + ".loop_min.orig"); + Expr guard = likely_if_innermost(orig_loop_min_expr <= loop_var_expr); + + // Put the if inside the consumer node, so semaphores end up outside the if. + // TODO: This is correct, but it produces slightly suboptimal code: if we + // didn't do this, the loop could likely be trimmed and the if simplified away. + Stmt body = mutate(op->body); + if (const IfThenElse *old_guard = body.as()) { + Expr x = Variable::make(Int(32), "*"); + vector matches; + if (expr_match(likely_if_innermost(x <= loop_var_expr), old_guard->condition, matches)) { + // There's already a condition on loop_var_expr here. Since we're + // adding a condition at the old loop min, this if must already be + // guarding more than we will. 
+ guard = Expr(); } } - return stmt; + if (guard.defined()) { + debug(3) << "Guarding body " << guard << "\n"; + body = IfThenElse::make(guard, body); + } + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make_consume(op->name, body); + } + } else { + return IRMutator::visit(op); } } @@ -340,39 +488,123 @@ class SlidingWindowOnFunctionAndLoop : public IRMutator { } public: - SlidingWindowOnFunctionAndLoop(Function f, string v, Expr v_min) - : func(std::move(f)), loop_var(std::move(v)), loop_min(std::move(v_min)) { + SlidingWindowOnFunctionAndLoop(Function f, string v, Expr v_min, set &slid_dimensions) + : func(std::move(f)), loop_var(std::move(v)), loop_min(std::move(v_min)), slid_dimensions(slid_dimensions) { } + + Expr new_loop_min; }; -// Perform sliding window optimization for a particular function -class SlidingWindowOnFunction : public IRMutator { - Function func; +// In Stmt s, does the production of b depend on a? +// We can't use produce/consume nodes to determine this, because they're "loose". +// For example, we get this: +// +// produce a { +// a(...) = ... +// } +// consume a { +// produce b { +// b(...) = ... // not depending on a +// } +// consume b { +// c(...) = a(...) + b(...) +// } +// } +// +// When we'd rather see this: +// +// produce a { +// a(...) = ... +// } +// produce b { +// b(...) = ... // not depending on a +// } +// consume a { +// consume b { +// c(...) = a(...) + b(...) +// } +// } +// +// TODO: We might also need to figure out transitive dependencies...? If so, it +// would be best to just fix the produce/consume relationships as above. We would +// just be able to look for produce b inside produce a. 
+class Dependencies : public IRVisitor { + using IRVisitor::visit; - using IRMutator::visit; + const string &producer; + bool in_producer = false; - Stmt visit(const For *op) override { - debug(3) << " Doing sliding window analysis over loop: " << op->name << "\n"; + void visit(const ProducerConsumer *op) override { + ScopedValue old_finding_a(in_producer, in_producer || (op->is_producer && op->name == producer)); + return IRVisitor::visit(op); + } - Stmt new_body = op->body; + void visit(const Call *op) override { + if (in_producer && op->call_type == Call::Halide) { + if (op->name != producer) { + dependencies.insert(op->name); + } + } + IRVisitor::visit(op); + } - new_body = mutate(new_body); +public: + set dependencies; + + Dependencies(const string &producer) + : producer(producer) { + } +}; - if (op->for_type == ForType::Serial || - op->for_type == ForType::Unrolled) { - new_body = SlidingWindowOnFunctionAndLoop(func, op->name, op->min).mutate(new_body); +bool depends_on(const string &a, const string &b, const Stmt &s, map &cache) { + if (a == b) { + return true; + } + auto cached = cache.find(b); + if (cached != cache.end()) { + return cached->second; + } + Dependencies deps(b); + s.accept(&deps); + // Recursively search for dependencies. + for (const string &i : deps.dependencies) { + if (depends_on(a, i, s, cache)) { + cache[b] = true; + return true; } + } + cache[b] = false; + return false; +} - if (new_body.same_as(op->body)) { - return op; +bool depends_on(const string &a, const string &b, const Stmt &s) { + map cache; + return depends_on(a, b, s, cache); +} + +// Update the loop variable referenced by prefetch directives. 
+class SubstitutePrefetchVar : public IRMutator { + const string &old_var; + const string &new_var; + + using IRMutator::visit; + + Stmt visit(const Prefetch *op) override { + Stmt new_body = mutate(op->body); + if (op->prefetch.var == old_var) { + PrefetchDirective p = op->prefetch; + p.var = new_var; + return Prefetch::make(op->name, op->types, op->bounds, p, op->condition, new_body); + } else if (!new_body.same_as(op->body)) { + return Prefetch::make(op->name, op->types, op->bounds, op->prefetch, op->condition, new_body); } else { - return For::make(op->name, op->min, op->extent, op->for_type, op->device_api, new_body); + return op; } } public: - SlidingWindowOnFunction(Function f) - : func(std::move(f)) { + SubstitutePrefetchVar(const string &old_var, const string &new_var) + : old_var(old_var), new_var(new_var) { } }; @@ -380,6 +612,13 @@ class SlidingWindowOnFunction : public IRMutator { class SlidingWindow : public IRMutator { const map &env; + // A map of which dimensions we've already slid over, by Func name. + map> slid_dimensions; + + // Keep track of realizations we want to slide, from innermost to + // outermost. + list sliding; + using IRMutator::visit; Stmt visit(const Realize *op) override { @@ -399,13 +638,17 @@ class SlidingWindow : public IRMutator { return IRMutator::visit(op); } - Stmt new_body = op->body; - - debug(3) << "Doing sliding window analysis on realization of " << op->name << "\n"; - - new_body = SlidingWindowOnFunction(iter->second).mutate(new_body); - - new_body = mutate(new_body); + // We want to slide innermost first, so put it on the front of + // the list. + sliding.push_front(iter->second); + Stmt new_body = mutate(op->body); + sliding.pop_front(); + // Remove tracking of slid dimensions when we're done realizing + // it in case a realization appears elsewhere. 
+ auto slid_it = slid_dimensions.find(iter->second.name()); + if (slid_it != slid_dimensions.end()) { + slid_dimensions.erase(slid_it); + } if (new_body.same_as(op->body)) { return op; @@ -415,16 +658,135 @@ } } + Stmt visit(const For *op) override { + if (!(op->for_type == ForType::Serial || op->for_type == ForType::Unrolled)) { + return IRMutator::visit(op); + } + debug(3) << "Doing sliding window analysis on loop " << op->name << "\n"; + + string name = op->name; + Stmt body = op->body; + Expr loop_min = op->min; + Expr loop_extent = op->extent; + Expr loop_max = Variable::make(Int(32), op->name + ".loop_max"); + + list> prev_loop_mins; + list> new_lets; + for (const Function &func : sliding) { + debug(3) << "Doing sliding window analysis on function " << func.name() << "\n"; + + // Figure out where we should start sliding from. If no + // other func needs this func, we can just start at the + // original loop min. + Expr prev_loop_min = op->min; + // If a previously slid func needs this func to be warmed + // up, then we need to back up the loop to warm up this + // func before the already slid func starts warming up. + for (const auto &i : prev_loop_mins) { + if (depends_on(func.name(), i.first, body)) { + prev_loop_min = i.second; + break; + } + } + + SlidingWindowOnFunctionAndLoop slider(func, name, prev_loop_min, slid_dimensions[func.name()]); + body = slider.mutate(body); + + if (slider.new_loop_min.defined()) { + Expr new_loop_min = slider.new_loop_min; + if (!prev_loop_min.same_as(loop_min)) { + // If we didn't start sliding from the previous + // loop min, the old loop min might already + // be further back than this new one. + new_loop_min = min(new_loop_min, loop_min); + } + + // Put this at the front of the list, so we find it first + // when checking subsequent funcs. + prev_loop_mins.emplace_front(func.name(), new_loop_min); + + // Update the loop body to use the adjusted loop min.
+ string new_name = name + ".$n"; + loop_min = Variable::make(Int(32), new_name + ".loop_min"); + loop_extent = Variable::make(Int(32), new_name + ".loop_extent"); + body = substitute({ + {name, Variable::make(Int(32), new_name)}, + {name + ".loop_min", loop_min}, + {name + ".loop_extent", loop_extent}, + }, + body); + body = SubstitutePrefetchVar(name, new_name).mutate(body); + + name = new_name; + + // The new loop interval is the new loop min to the loop max. + new_lets.emplace_front(name + ".loop_min", new_loop_min); + new_lets.emplace_front(name + ".loop_min.orig", loop_min); + new_lets.emplace_front(name + ".loop_extent", (loop_max - loop_min) + 1); + } + } + + body = mutate(body); + + if (body.same_as(op->body) && loop_min.same_as(op->min) && loop_extent.same_as(op->extent) && name == op->name) { + return op; + } else { + Stmt result = For::make(name, loop_min, loop_extent, op->for_type, op->device_api, body); + if (!new_lets.empty()) { + result = LetStmt::make(name + ".loop_max", loop_max, result); + } + for (const auto &i : new_lets) { + result = LetStmt::make(i.first, i.second, result); + } + return result; + } + } + + Stmt visit(const IfThenElse *op) override { + // Don't let specializations corrupt the tracking of which + // dimensions have been slid. + map> old_slid_dimensions = slid_dimensions; + Stmt then_case = mutate(op->then_case); + slid_dimensions = old_slid_dimensions; + Stmt else_case = mutate(op->else_case); + slid_dimensions = old_slid_dimensions; + if (then_case.same_as(op->then_case) && else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } + } + public: SlidingWindow(const map &e) : env(e) { } }; +// It is convenient to be able to assume that loops have a .loop_min.orig +// let in addition to .loop_min. Most of these will get simplified away. 
+class AddLoopMinOrig : public IRMutator { + using IRMutator::visit; + + Stmt visit(const For *op) override { + Stmt body = mutate(op->body); + Expr min = mutate(op->min); + Expr extent = mutate(op->extent); + + Stmt result; + if (body.same_as(op->body) && min.same_as(op->min) && extent.same_as(op->extent)) { + result = op; + } else { + result = For::make(op->name, min, extent, op->for_type, op->device_api, body); + } + return LetStmt::make(op->name + ".loop_min.orig", Variable::make(Int(32), op->name + ".loop_min"), result); + } +}; + } // namespace Stmt sliding_window(const Stmt &s, const map &env) { - return SlidingWindow(env).mutate(s); + return SlidingWindow(env).mutate(AddLoopMinOrig().mutate(s)); } } // namespace Internal diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index 1c4c4c62cb0f..548d46cb57cf 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -512,6 +512,8 @@ class AttemptStorageFoldingOfFunction : public IRMutator { Box provided = box_provided(body, func.name()); Box required = box_required(body, func.name()); + // For storage folding, we don't care about conditional reads. 
+ required.used = Expr(); Box box = box_union(provided, required); Expr loop_var = Variable::make(Int(32), op->name); @@ -674,8 +676,10 @@ class AttemptStorageFoldingOfFunction : public IRMutator { // Can't do much with this dimension if (!explicit_only) { debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" - << "min = " << min << "\n" - << "max = " << max << "\n"; + << "min_initial = " << min_initial << "\n" + << "min_steady = " << min_steady << "\n" + << "max_initial = " << max_initial << "\n" + << "max_steady = " << max_steady << "\n"; } else { debug(3) << "Not folding because there is no explicit storage folding factor\n"; } @@ -789,22 +793,16 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - // Logically we acquire the entire extent on - // the first iteration: - - // to_acquire = select(loop_var > loop_min, to_acquire, extent); - - // However it's simpler to implement this by - // just reducing the initial value on the - // semaphore by the difference, as long as it - // doesn't lift any inner names out of scope. - - Expr fudge = simplify(substitute(op->name, loop_min, extent - to_acquire)); - if (is_const(fudge) && can_prove(fudge <= sema.init)) { - sema.init -= fudge; - } else { - to_acquire = select(loop_var > loop_min, likely(to_acquire), extent); + if (provided.used.defined()) { + to_acquire = select(provided.used, to_acquire, 0); } + // We should always release the required region, even if we don't use it. + + // On the first iteration, we need to acquire the extent of the region shared + // between the producer and consumer, and we need to release it on the last + // iteration. 
+ to_acquire = select(loop_var > loop_min, to_acquire, extent); + to_release = select(loop_var < loop_max, to_release, extent); // We may need dynamic assertions that a positive // amount of the semaphore is acquired/released, @@ -864,10 +862,8 @@ class AttemptStorageFoldingOfFunction : public IRMutator { } else { stmt = op; debug(3) << "Not folding because loop min or max not monotonic in the loop variable\n" - << "min_initial = " << min_initial << "\n" - << "min_steady = " << min_steady << "\n" - << "max_initial = " << max_initial << "\n" - << "max_steady = " << max_steady << "\n"; + << "min = " << min << "\n" + << "max = " << max << "\n"; break; } } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9873896b4223..fc1a9a182f49 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -14,11 +14,6 @@ tests(GROUPS correctness atomic_tuples.cpp atomics.cpp autodiff.cpp - autotune_bug.cpp - autotune_bug_2.cpp - autotune_bug_3.cpp - autotune_bug_4.cpp - autotune_bug_5.cpp bad_likely.cpp bit_counting.cpp bitwise_ops.cpp diff --git a/test/correctness/autotune_bug.cpp b/test/correctness/autotune_bug.cpp deleted file mode 100644 index f8403b6d49ec..000000000000 --- a/test/correctness/autotune_bug.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#define AUTOTUNE_N 16, 16 - -// This tests a segfault generated by an autotuned schedule. 
- -#include "Halide.h" -#include - -using namespace Halide; - -int main(int argc, char **argv) { - - ImageParam in_img(UInt(16), 2); - Func blur_x("blur_x"), blur_y("blur_y"); - Var x("x"), y("y"), xi("xi"), yi("yi"); - - Func input; - input(x, y) = in_img(clamp(x, 1, in_img.width() - 1), - clamp(y, 1, in_img.height()) - 1); - - // The algorithm - blur_x(x, y) = (input(x, y) + input(x + 1, y) + input(x + 2, y)) / 3; - blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3; - - Halide::Var _x2; - input - .reorder_storage(y, x) - .compute_root(); - blur_x - .split(x, x, _x2, 4) - .compute_at(blur_y, y); - blur_y - .reorder(y, x); - - blur_y.compile_jit(); - blur_y.infer_input_bounds({AUTOTUNE_N}); - assert(in_img.get().data()); - blur_y.realize({AUTOTUNE_N}); - - printf("Success!\n"); - return 0; -} diff --git a/test/correctness/autotune_bug_2.cpp b/test/correctness/autotune_bug_2.cpp deleted file mode 100644 index 65fc7d507e80..000000000000 --- a/test/correctness/autotune_bug_2.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f will be stored from 0 to 8 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 8) { - printf("Bounds on realization of f were supposed to be >= [0, 9]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"); - Var x("x"); - f(x) = x; - RDom r(17, 1); - f(x) = r; - f.store_root(); - - g(x) = f(x) + f(x + 1); - f.compute_at(g, x); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 8); - f.update(); - - f.trace_realizations().trace_stores(); - - g.set_custom_trace(&my_trace); - g.bound(x, 0, 2); - g.output_buffer().dim(0).set_bounds(0, 2); - g.realize({2}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_3.cpp 
b/test/correctness/autotune_bug_3.cpp deleted file mode 100644 index 4bab12448b76..000000000000 --- a/test/correctness/autotune_bug_3.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f will be stored from 0 to 8 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 8) { - printf("Bounds on realization of f were supposed to be >= [0, 8]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"); - Var x("x"); - f(x) = x; - f.store_root(); - - g(x) = f(x) + f(x + 1); - f.compute_at(g, x); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 8); - - f.trace_realizations().trace_stores(); - - g.set_custom_trace(&my_trace); - g.bound(x, 0, 2); - g.output_buffer().dim(0).set_bounds(0, 2); - g.realize({2}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_4.cpp b/test/correctness/autotune_bug_4.cpp deleted file mode 100644 index 6fc5a0751f6e..000000000000 --- a/test/correctness/autotune_bug_4.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int my_trace(void *user_context, const halide_trace_event_t *e) { - // The schedule implies that f and g will be stored from 0 to 7 - if (e->event == 2 && std::string(e->func) == "f") { - if (e->coordinates[1] < 7) { - printf("Bounds on realization were supposed to be = [0, 7]\n" - "Instead they are: %d %d\n", - e->coordinates[0], e->coordinates[1]); - exit(-1); - } - } - return 0; -} - -int main(int argc, char **argv) { - Func f("f"), g("g"), h("h"); - Var x("x"); - - f(x) = x; - g(x) = f(x); - h(x) = g(x) + g(x + 1); - - Var xo("xo"), xi("xi"); - f.split(x, xo, xi, 4); - g.split(x, xo, xi, 5); - h.split(x, xo, xi, 6); - f.compute_at(h, xo); - g.compute_at(h, xo); - g.store_root(); 
- - f.trace_realizations().trace_stores().trace_loads(); - g.trace_realizations().trace_stores().trace_loads(); - - h.set_custom_trace(&my_trace); - h.bound(x, 0, 6); - h.realize({6}); - - printf("Success!\n"); - - return 0; -} diff --git a/test/correctness/autotune_bug_5.cpp b/test/correctness/autotune_bug_5.cpp deleted file mode 100644 index e012a1121501..000000000000 --- a/test/correctness/autotune_bug_5.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -int main(int argc, char **argv) { - Buffer input(1024, 1024); - - Func upsampled("upsampled"); - Func upsampledx("upsampledx"); - Var x("x"), y("y"); - - Func clamped("clamped"); - clamped(x, y) = input(x, y); - - upsampledx(x, y) = select((x % 2) == 0, - clamped(x, y), - clamped(x + 1, y)); - upsampled(x, y) = upsampledx(x, y); - - Var xi("xi"), yi("yi"); - clamped.compute_root(); // passes if this is removed, switched to inline - upsampled - .split(y, y, yi, 8) - .reorder(yi, y, x) - .compute_root(); - - upsampledx.compute_at(upsampled, yi); - - upsampled.realize({100, 100}); - - printf("Success!\n"); - return 0; -} diff --git a/test/correctness/sliding_reduction.cpp b/test/correctness/sliding_reduction.cpp index 087bc2d06b7d..3ce75056a09b 100644 --- a/test/correctness/sliding_reduction.cpp +++ b/test/correctness/sliding_reduction.cpp @@ -87,10 +87,8 @@ int main(int argc, char **argv) { // clobber earlier values of the final stage of f, so we have // to compute the final stage of f two rows at a time as well. - // The result is that we evaluate the first three rows of f - // for the first scanline of g, and then another two rows for - // every row of g thereafter. This adds up to 2*(3 + 9*2) = 42 - // evaluations of f. + // The result is that we extend the loop to warm up f by 2 + // iterations. This adds up to 2*(12*2) = 48 evaluations of f. 
Func f("f"); f(x, y) = x; f(0, y) += f(1, y) + f(2, y); @@ -108,7 +106,7 @@ int main(int argc, char **argv) { counter = 0; check(g.realize({2, 10})); - int correct = 42; + int correct = 48; if (counter != correct) { printf("Failed sliding a reduction: %d evaluations instead of %d\n", counter, correct); return -1; diff --git a/test/correctness/sliding_window.cpp b/test/correctness/sliding_window.cpp index 413bf9233160..31875158600a 100644 --- a/test/correctness/sliding_window.cpp +++ b/test/correctness/sliding_window.cpp @@ -40,6 +40,9 @@ int main(int argc, char **argv) { f.store_root().compute_at(g, x); + // Test that sliding window works when specializing. + g.specialize(g.output_buffer().dim(0).min() == 0); + Buffer im = g.realize({100}); // f should be able to tell that it only needs to compute each value once @@ -49,6 +52,44 @@ int main(int argc, char **argv) { } } + // Try two producers used by the same consumer. + { + count = 0; + Func f, g, h; + + f(x) = call_counter(2 * x + 0, 0); + g(x) = call_counter(2 * x + 1, 0); + h(x) = f(x) + f(x - 1) + g(x) + g(x - 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + Buffer im = h.realize({100}); + if (count != 202) { + printf("f was called %d times instead of %d times\n", count, 202); + return -1; + } + } + + // Try a sequence of two sliding windows. + { + count = 0; + Func f, g, h; + + f(x) = call_counter(2 * x + 0, 0); + g(x) = f(x) + f(x - 1); + h(x) = g(x) + g(x - 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + Buffer im = h.realize({100}); + if (count != 102) { + printf("f was called %d times instead of %d times\n", count, 102); + return -1; + } + } + // Try again where there's a containing stage { count = 0; @@ -201,6 +242,24 @@ int main(int argc, char **argv) { } } + { + // Sliding where we only need a new value every third iteration of the consumer. + // This test checks that we don't ask for excessive bounds. 
+ ImageParam f(Int(32), 1); + Func g; + + g(x) = f(x / 3); + + Var xo; + g.split(x, xo, x, 10); + f.in().store_at(g, xo).compute_at(g, x); + + Buffer buf(33); + f.set(buf); + + Buffer im = g.realize({98}); + } + { // Sliding with an unrolled producer Var x, xi; @@ -221,6 +280,102 @@ int main(int argc, char **argv) { } } + { + // Sliding with a vectorized producer and consumer. + count = 0; + Func f, g; + f(x) = call_counter(x, 0); + g(x) = f(x + 1) + f(x - 1); + + f.store_root().compute_at(g, x).vectorize(x, 4); + g.vectorize(x, 4); + + Buffer im = g.realize({100}); + if (count != 104) { + printf("f was called %d times instead of %d times\n", count, 104); + return -1; + } + } + + { + // A sequence of stencils, all computed at the output. + count = 0; + Func f, g, h, u, v; + f(x, y) = call_counter(x, y); + g(x, y) = f(x, y - 1) + f(x, y + 1); + h(x, y) = g(x - 1, y) + g(x + 1, y); + u(x, y) = h(x, y - 1) + h(x, y + 1); + v(x, y) = u(x - 1, y) + u(x + 1, y); + + u.compute_at(v, y); + h.store_root().compute_at(v, y); + g.store_root().compute_at(v, y); + f.store_root().compute_at(v, y); + + v.realize({10, 10}); + if (count != 14 * 14) { + printf("f was called %d times instead of %d times\n", count, 14 * 14); + return -1; + } + } + + { + // A sequence of stencils, sliding computed at the output. + count = 0; + Func f, g, h, u, v; + f(x, y) = call_counter(x, y); + g(x, y) = f(x, y - 1) + f(x, y + 1); + h(x, y) = g(x - 1, y) + g(x + 1, y); + u(x, y) = h(x, y - 1) + h(x, y + 1); + v(x, y) = u(x - 1, y) + u(x + 1, y); + + u.compute_at(v, y); + h.store_root().compute_at(v, y); + g.compute_at(h, y); + f.store_root().compute_at(v, y); + + v.realize({10, 10}); + if (count != 14 * 14) { + printf("f was called %d times instead of %d times\n", count, 14 * 14); + return -1; + } + } + + { + // Sliding a func that has a boundary condition before the beginning + // of the loop. This needs an explicit warmup before we start sliding. 
+ count = 0; + Func f, g; + f(x) = call_counter(x, 0); + g(x) = f(max(x, 3)); + + f.store_root().compute_at(g, x); + + g.realize({10}); + if (count != 7) { + printf("f was called %d times instead of %d times\n", count, 7); + return -1; + } + } + + { + // Sliding a func that has a boundary condition on both sides. + count = 0; + Func f, g, h; + f(x) = call_counter(x, 0); + g(x) = f(clamp(x, 0, 9)); + h(x) = g(x - 1) + g(x + 1); + + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x); + + h.realize({10}); + if (count != 10) { + printf("f was called %d times instead of %d times\n", count, 10); + return -1; + } + } + printf("Success!\n"); return 0; } diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp index 3bbfb672f512..bf412e52d7ab 100644 --- a/test/correctness/storage_folding.cpp +++ b/test/correctness/storage_folding.cpp @@ -385,6 +385,42 @@ int main(int argc, char **argv) { } } + { + custom_malloc_sizes.clear(); + Func f, g, h; + + // Two stages of upsampling is even trickier. + h(x, y) = x * y; + g(x, y) = h(x, y / 2) + h(x, y / 2 + 1); + f(x, y) = g(x, y / 2) + g(x, y / 2 + 1); + + h.compute_at(f, y).store_root().fold_storage(y, 4); + g.compute_at(f, y).store_root().fold_storage(y, 2); + + f.set_custom_allocator(my_malloc, my_free); + + Buffer im = f.realize({1000, 1000}); + + // Halide allocates one extra scalar, so we account for that. 
+ size_t expected_size_g = 1000 * 4 * sizeof(int) + sizeof(int); + size_t expected_size_h = 1000 * 2 * sizeof(int) + sizeof(int); + if (!check_expected_mallocs({expected_size_g, expected_size_h})) { + return -1; + } + + for (int y = 0; y < im.height(); y++) { + for (int x = 0; x < im.width(); x++) { + auto correct_h = [](int x, int y) { return x * y; }; + auto correct_g = [=](int x, int y) { return correct_h(x, y / 2) + correct_h(x, y / 2 + 1); }; + auto correct_f = [=](int x, int y) { return correct_g(x, y / 2) + correct_g(x, y / 2 + 1); }; + if (im(x, y) != correct_f(x, y)) { + printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct_f(x, y)); + return -1; + } + } + } + } + for (bool interleave : {false, true}) { Func f, g; diff --git a/test/correctness/tracing.cpp b/test/correctness/tracing.cpp index 92cce6f2f752..1eeab85513c0 100644 --- a/test/correctness/tracing.cpp +++ b/test/correctness/tracing.cpp @@ -243,9 +243,9 @@ int main(int argc, char **argv) { {103, 11, 1, 2, 32, 4, 1, 4, {1, 2, 3, 4}, {0.995004f, 0.980067f, 0.955337f, 0.921061f}, ""}, {103, 11, 5, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 17, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.099833f, 0.198669f, 0.295520f}, ""}, {103, 17, 0, 2, 32, 4, 1, 4, {1, 2, 3, 4}, {0.995004f, 0.980067f, 0.955337f, 0.921061f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {0, 1, 2, 3}, {0.995004f, 1.079900f, 1.154006f, 1.216581f}, ""}, {103, 17, 7, 3, 0, 0, 0, 2, {0, 5, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 4, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, @@ -253,9 +253,9 @@ int main(int argc, char **argv) { {103, 23, 1, 2, 32, 4, 1, 
4, {5, 6, 7, 8}, {0.877583f, 0.825336f, 0.764842f, 0.696707f}, ""}, {103, 23, 5, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 27, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.389418f, 0.479426f, 0.564642f, 0.644218f}, ""}, {103, 27, 0, 2, 32, 4, 1, 4, {5, 6, 7, 8}, {0.877583f, 0.825336f, 0.764842f, 0.696707f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {4, 5, 6, 7}, {1.267001f, 1.304761f, 1.329485f, 1.340924f}, ""}, {103, 27, 7, 3, 0, 0, 0, 2, {5, 4, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 4, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, @@ -263,9 +263,9 @@ int main(int argc, char **argv) { {103, 33, 1, 2, 32, 4, 1, 4, {7, 8, 9, 10}, {0.764842f, 0.696707f, 0.621610f, 0.540302f}, ""}, {103, 33, 5, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 9, 6, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, + {105, 1, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {103, 37, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.564642f, 0.644218f, 0.717356f, 0.783327f}, ""}, {103, 37, 0, 2, 32, 4, 1, 4, {7, 8, 9, 10}, {0.764842f, 0.696707f, 0.621610f, 0.540302f}, ""}, - {105, 1, 0, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 1, 2, 32, 4, 0, 4, {6, 7, 8, 9}, {1.329485f, 1.340924f, 1.338966f, 1.323629f}, ""}, {103, 37, 7, 3, 0, 0, 0, 2, {9, 2, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""}, {102, 10, 5, 3, 0, 0, 0, 2, {0, 10, 0, 0}, {0.000000f, 0.000000f, 0.000000f, 0.000000f}, ""},