From a28a503dbf30ae130eccabd277cb3018e88077cf Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Wed, 3 Mar 2021 09:49:09 -0700
Subject: [PATCH 01/13] Safer version of vldN code generation.

---
 src/CodeGen_ARM.cpp                    | 46 +++++++++-----------------
 test/correctness/simd_op_check.cpp     | 28 ++++++++--------
 test/correctness/simd_op_check.h       |  5 +++
 test/correctness/simd_op_check_hvx.cpp |  8 ++---
 4 files changed, 39 insertions(+), 48 deletions(-)
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 89260d758cc7..7c3293a343b2 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1000,41 +1000,27 @@ void CodeGen_ARM::visit(const Load *op) {
         return;
     }
 
-    // Strided loads with known stride
-    if (stride && stride->value >= 2 && stride->value <= 4) {
-        // Check alignment on the base. Attempt to shift to an earlier
-        // address if it simplifies the expression. This makes
-        // adjacent strided loads shared a vldN op.
+    // Try to rewrite strided loads as shuffles of dense loads,
+    // aligned to the stride. This makes adjacent strided loads
+    // shared a vldN op. However, this is only safe when the new
+    // load would not cross any alignment boundaries not crossed
+    // by the original load.
+    ModulusRemainder alignment = op->alignment;
+    if (stride && stride->value >= 2 && stride->value <= 4 &&
+        alignment.modulus % stride->value == 0) {
         Expr base = ramp->base;
-        int offset = 0;
-        ModulusRemainder mod_rem = modulus_remainder(ramp->base);
-
-        const Add *add = base.as<Add>();
-        const IntImm *add_b = add ? add->b.as<IntImm>() : nullptr;
-
-        if ((mod_rem.modulus % stride->value) == 0) {
-            offset = mod_rem.remainder % stride->value;
-        } else if ((mod_rem.modulus == 1) && add_b) {
-            offset = add_b->value % stride->value;
-            if (offset < 0) {
-                offset += stride->value;
-            }
-        }
-
+        int offset = mod_imp(alignment.remainder, stride->value);
         if (offset) {
             base = simplify(base - offset);
-            mod_rem.remainder -= offset;
-            if (mod_rem.modulus) {
-                mod_rem.remainder = mod_imp(mod_rem.remainder, mod_rem.modulus);
-            }
+            alignment.remainder -= offset;
         }
 
-        int alignment = op->type.bytes();
-        alignment *= gcd(mod_rem.modulus, mod_rem.remainder);
+        int align_bytes =
+            gcd(alignment.modulus, alignment.remainder) * op->type.bytes();
         // Maximum stack alignment on arm is 16 bytes, so we should
         // never claim alignment greater than that.
-        alignment = gcd(alignment, 16);
-        internal_assert(alignment > 0);
+        align_bytes = gcd(align_bytes, 16);
+        internal_assert(align_bytes > 0);
 
         // Decide what width to slice things into. If not a multiple
         // of 64 or 128 bits, then we can't safely slice it up into
@@ -1068,9 +1054,9 @@ void CodeGen_ARM::visit(const Load *op) {
             Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type);
             LoadInst *loadI = cast<LoadInst>(builder->CreateLoad(bitcastI));
 #if LLVM_VERSION >= 110
-            loadI->setAlignment(Align(alignment));
+            loadI->setAlignment(Align(align_bytes));
 #else
-            loadI->setAlignment(MaybeAlign(alignment));
+            loadI->setAlignment(MaybeAlign(align_bytes));
 #endif
             add_tbaa_metadata(loadI, op->name, slice_ramp);
             Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV);
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index 274ed02ce1ea..918d60e48bec 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -811,22 +811,22 @@ class SimdOpCheck : public SimdOpCheckTest {
             check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1));
 
             // VLD3     X       -       Load Three-Element Structures
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3 + y));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3 + y));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3 + y));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3 + y));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3 + y));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3 + y));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3 + y));
+            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3));
 
             // VLD4     X       -       Load Four-Element Structures
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4 + y));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4 + y));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4 + y));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4 + y));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4 + y));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4 + y));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4 + y));
+            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4));
 
             // VLDM     X       F, D    Load Multiple Registers
             // VLDR     X       F, D    Load Single Register
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
index 36303bd29c16..b8e3f6636134 100644
--- a/test/correctness/simd_op_check.h
+++ b/test/correctness/simd_op_check.h
@@ -52,6 +52,11 @@ class SimdOpCheckTest {
                      .with_feature(Target::NoRuntime)
                      .with_feature(Target::DisableLLVMLoopOpt);
         num_threads = Internal::ThreadPool<void>::num_processors_online();
+
+        // Some tests need to know the alignment of the buffer.
+        for (ImageParam i : image_params) {
+            i.dim(0).set_min(0);
+        }
     }
     virtual ~SimdOpCheckTest() = default;
     size_t get_num_threads() const {
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp
index 943abd0a7895..8566593da450 100644
--- a/test/correctness/simd_op_check_hvx.cpp
+++ b/test/correctness/simd_op_check_hvx.cpp
@@ -437,10 +437,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
         check("vnot(v*)", hvx_width / 2, ~u16_1);
         check("vnot(v*)", hvx_width / 4, ~u32_1);
 
-        // v62 - Broadcasting unsigned scalars
-        check("v*.b = vsplat(r*)", hvx_width / 1, in_u8(0));
-        check("v*.h = vsplat(r*)", hvx_width / 2, in_u16(0));
-        check("vsplat(r*)", hvx_width / 4, in_u32(0));
+        // v62 - Broadcasting scalars
+        check("vsplat(r*)", hvx_width / 1, in_u8(y));
+        check("vsplat(r*)", hvx_width / 2, in_u16(y));
+        check("vsplat(r*)", hvx_width / 4, in_u32(y));
 
         check("vmux(q*,v*,v*)", hvx_width / 1, select(i8_1 == i8_2, i8_3, i8_2));
         check("vmux(q*,v*,v*)", hvx_width / 2, select(i16_1 == i16_2, i16_3, i16_2));

From 94ca00ae3f5b06960bde58790b10c28d26b9ac4c Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Wed, 3 Mar 2021 10:53:44 -0700
Subject: [PATCH 02/13] Only be more conservative with alignment for external
 buffers.

---
 src/CodeGen_ARM.cpp | 26 +++++++++++++++++++++-----
 src/CodeGen_ARM.h   |  2 ++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 7c3293a343b2..0281c9c556fb 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -972,6 +972,12 @@ void CodeGen_ARM::visit(const Store *op) {
     CodeGen_Posix::visit(op);
 }
 
+int CodeGen_ARM::allocation_padding(Type type) const {
+    // We want to be able to load up to 3 values past the end of the buffer in
+    // the Load visitor.
+    return std::max(CodeGen_Posix::allocation_padding(type), type.bytes() * 3);
+}
+
 void CodeGen_ARM::visit(const Load *op) {
     // Predicated load
     if (!is_const_one(op->predicate)) {
@@ -1002,12 +1008,22 @@ void CodeGen_ARM::visit(const Load *op) {
 
     // Try to rewrite strided loads as shuffles of dense loads,
     // aligned to the stride. This makes adjacent strided loads
-    // shared a vldN op. However, this is only safe when the new
-    // load would not cross any alignment boundaries not crossed
-    // by the original load.
+    // shared a vldN op.
     ModulusRemainder alignment = op->alignment;
-    if (stride && stride->value >= 2 && stride->value <= 4 &&
-        alignment.modulus % stride->value == 0) {
+    if (stride && stride->value >= 2 && stride->value <= 4) {
+        // For internal buffers, we know this is safe because we
+        // allocate an extra 3 values of padding. For external
+        // buffers, we can only do this if we know the alignment
+        // makes it safe.
+        // (In ASAN mode, don't read beyond the end of internal buffers either,
+        // as ASAN will complain even about harmless stack overreads.)
+        bool external = op->param.defined() || op->image.defined();
+        if (alignment.modulus % stride->value != 0 &&
+            (external || target.has_feature(Target::ASAN))) {
+            CodeGen_Posix::visit(op);
+            return;
+        }
+
         Expr base = ramp->base;
         int offset = mod_imp(alignment.remainder, stride->value);
         if (offset) {
diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h
index 39d1a39d7a0f..49d31b9e1662 100644
--- a/src/CodeGen_ARM.h
+++ b/src/CodeGen_ARM.h
@@ -44,6 +44,8 @@ class CodeGen_ARM : public CodeGen_Posix {
     void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
     // @}
 
+    int allocation_padding(Type type) const override;
+
     /** Various patterns to peephole match against */
     struct Pattern {
         std::string intrin;  ///< Name of the intrinsic

From e4574a7b1644844f14fbfbfa91371ddcb1ef1e3b Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Wed, 3 Mar 2021 13:13:29 -0700
Subject: [PATCH 03/13] Add tolerance to allocation size tests.

---
 test/correctness/nested_tail_strategies.cpp   |  5 +-
 test/correctness/pseudostack_shares_slots.cpp | 31 ++++---
 test/correctness/reorder_storage.cpp          | 15 +--
 test/correctness/storage_folding.cpp          | 93 +++++++++++--------
 4 files changed, 84 insertions(+), 60 deletions(-)

diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp
index e3b11503aff2..2f435f975dd3 100644
--- a/test/correctness/nested_tail_strategies.cpp
+++ b/test/correctness/nested_tail_strategies.cpp
@@ -45,14 +45,15 @@ void check(Func out, int line, std::vector<TailStrategy> tails) {
         largest_allocation = 0;
         out.realize({s});
         size_t expected = (s + 1) * 4;
-        if (largest_allocation > expected) {
+        size_t tolerance = 3 * sizeof(int);
+        if (largest_allocation > expected + tolerance) {
             std::cerr << "Failure on line " << line << "\n"
                       << "with tail strategies: ";
             for (auto t : tails) {
                 std::cerr << t << " ";
             }
             std::cerr << "\n allocation of " << largest_allocation
-                      << " bytes is too large. Expected " << expected << "\n";
+                      << " bytes is too large. Expected " << expected + tolerance << "\n";
             abort();
         }
     }
diff --git a/test/correctness/pseudostack_shares_slots.cpp b/test/correctness/pseudostack_shares_slots.cpp
index fd491ba149ea..a9e2f49dd048 100644
--- a/test/correctness/pseudostack_shares_slots.cpp
+++ b/test/correctness/pseudostack_shares_slots.cpp
@@ -2,10 +2,11 @@
 
 using namespace Halide;
 
-std::vector<size_t> mallocs;
+const int tolerance = 3 * sizeof(int);
+std::vector<int> mallocs;
 
 void *my_malloc(void *user_context, size_t x) {
-    mallocs.push_back(x);
+    mallocs.push_back((int)x);
     void *orig = malloc(x + 32);
     void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5);
     ((void **)ptr)[-1] = orig;
@@ -52,9 +53,11 @@ int main(int argc, char **argv) {
             mallocs.clear();
             p.set(sz);
             chain.back().realize({1024});
-            size_t sz1 = sz + 2 * 20 - 1;
-            size_t sz2 = sz1 - 2;
-            if (mallocs.size() != 2 || mallocs[0] != sz1 || mallocs[1] != sz2) {
+            int sz1 = sz + 2 * 20 - 1;
+            int sz2 = sz1 - 2;
+            if (mallocs.size() != 2 ||
+                std::abs(mallocs[0] - sz1) > tolerance ||
+                std::abs(mallocs[1] - sz2) > tolerance) {
                 printf("Incorrect allocations: %d %d %d\n", (int)mallocs.size(), (int)mallocs[0], (int)mallocs[1]);
                 printf("Expected: 2 %d %d\n", (int)sz1, (int)sz2);
                 return -1;
@@ -93,14 +96,18 @@ int main(int argc, char **argv) {
             mallocs.clear();
             p.set(sz);
             chain.back().realize({1024});
-            size_t sz1 = sz / 8 + 23;
-            size_t sz2 = sz1 - 2;
-            size_t sz3 = sz + 19;
-            size_t sz4 = sz3 - 2;
-            if (mallocs.size() != 4 || mallocs[0] != sz1 || mallocs[1] != sz2 || mallocs[2] != sz3 || mallocs[3] != sz4) {
+            int sz1 = sz / 8 + 23;
+            int sz2 = sz1 - 2;
+            int sz3 = sz + 19;
+            int sz4 = sz3 - 2;
+            if (mallocs.size() != 4 ||
+                std::abs(mallocs[0] - sz1) > tolerance ||
+                std::abs(mallocs[1] - sz2) > tolerance ||
+                std::abs(mallocs[2] - sz3) > tolerance ||
+                std::abs(mallocs[3] - sz4) > tolerance) {
                 printf("Incorrect allocations: %d %d %d %d %d\n", (int)mallocs.size(),
-                       (int)mallocs[0], (int)mallocs[1], (int)mallocs[2], (int)mallocs[3]);
-                printf("Expected: 4 %d %d %d %d\n", (int)sz1, (int)sz2, (int)sz3, (int)sz4);
+                       mallocs[0], mallocs[1], mallocs[2], mallocs[3]);
+                printf("Expected: 4 %d %d %d %d\n", sz1, sz2, sz3, sz4);
                 return -1;
             }
         }
diff --git a/test/correctness/reorder_storage.cpp b/test/correctness/reorder_storage.cpp
index 546812bad57a..953cd2b1e821 100644
--- a/test/correctness/reorder_storage.cpp
+++ b/test/correctness/reorder_storage.cpp
@@ -3,11 +3,13 @@
 
 using namespace Halide;
 
-size_t expected_allocation = 0;
+// Backends allocate up to 3 extra elements.
+int tolerance = 3 * sizeof(int);
+int expected_allocation = 0;
 
 void *my_malloc(void *user_context, size_t x) {
-    if (x != expected_allocation) {
-        printf("Error! Expected allocation of %zu bytes, got %zu bytes\n", expected_allocation, x);
+    if (std::abs((int)x - expected_allocation) > tolerance) {
+        printf("Error! Expected allocation of %d bytes, got %zu bytes (tolerance %d)\n", expected_allocation, x, tolerance);
         exit(-1);
     }
     return malloc(x);
@@ -40,11 +42,10 @@ int main(int argc, char **argv) {
     g.set_custom_allocator(my_malloc, my_free);
 
     // Without any storage alignment, we should expect an allocation
-    // that is the product of the extents of the realization (plus one
-    // for the magical extra Halide element).
+    // that is the product of the extents of the realization.
     int W = 10;
     int H = 11;
-    expected_allocation = (3 * W * H + 1) * sizeof(int);
+    expected_allocation = 3 * W * H * sizeof(int);
 
     g.realize({W, H, 3});
 
@@ -53,7 +54,7 @@ int main(int argc, char **argv) {
 
     // We've aligned the x dimension, make sure the allocation reflects this.
     int W_aligned = (W + x_alignment - 1) & ~(x_alignment - 1);
-    expected_allocation = (W_aligned * H * 3 + 1) * sizeof(int);
+    expected_allocation = W_aligned * H * 3 * sizeof(int);
 
     // Force g to clear it's cache...
     g.compute_root();
diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp
index b91bbeaf7859..50533c22768f 100644
--- a/test/correctness/storage_folding.cpp
+++ b/test/correctness/storage_folding.cpp
@@ -1,14 +1,16 @@
 #include "Halide.h"
 #include <stdio.h>
 
+#include <set>
+
 using namespace Halide;
 
 // Override Halide's malloc and free
-
-size_t custom_malloc_size = 0;
+const int tolerance = 3 * sizeof(int);
+std::set<size_t> custom_malloc_sizes;
 
 void *my_malloc(void *user_context, size_t x) {
-    custom_malloc_size = x;
+    custom_malloc_sizes.insert(x);
     void *orig = malloc(x + 32);
     void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5);
     ((void **)ptr)[-1] = orig;
@@ -19,6 +21,28 @@ void my_free(void *user_context, void *ptr) {
     free(((void **)ptr)[-1]);
 }
 
+bool check_expected_malloc(size_t expected) {
+    for (size_t i : custom_malloc_sizes) {
+        if (std::abs((int)i - (int)expected) <= tolerance) {
+            return true;
+        }
+    }
+    printf("Expected an allocation of size %d (tolerance %d). Got instead:\n", (int)expected, tolerance);
+    for (size_t i : custom_malloc_sizes) {
+        printf("  %d\n", (int)i);
+    }
+    return false;
+}
+
+bool check_expected_mallocs(const std::vector<size_t> &expected) {
+    for (size_t i : expected) {
+        if (!check_expected_malloc(i)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 #ifdef _WIN32
 #define DLLEXPORT __declspec(dllexport)
 #else
@@ -111,9 +135,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000, 3});
 
-        size_t expected_size = 101 * 4 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 4 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
@@ -133,9 +156,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000, 3});
 
-        size_t expected_size = 101 * 1002 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 1002 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
@@ -157,15 +179,14 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000});
 
-        size_t expected_size = 101 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -180,7 +201,7 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        if (custom_malloc_size != 0) {
+        if (!custom_malloc_sizes.empty()) {
             printf("There should not have been a heap allocation\n");
             return -1;
         }
@@ -197,7 +218,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -213,7 +234,7 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        if (custom_malloc_size != 0) {
+        if (!custom_malloc_sizes.empty()) {
             printf("There should not have been a heap allocation\n");
             return -1;
         }
@@ -230,7 +251,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -248,9 +269,8 @@ int main(int argc, char **argv) {
         Buffer<int> im = f.realize({1000, 1000});
 
         // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 2 * 1002 * 4 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 2 * 1000 * 4 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -266,7 +286,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -286,9 +306,8 @@ int main(int argc, char **argv) {
         Buffer<int> im = f.realize({1000, 1000});
 
         // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 1000 * 8 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 1000 * 8 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -304,7 +323,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -323,9 +342,8 @@ int main(int argc, char **argv) {
         Buffer<int> im = f.realize({1000, 1000});
 
         // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 2 * 1002 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 2 * 1000 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -341,24 +359,22 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
+        // This is tricky due to upsampling.
         g(x, y) = x * y;
         f(x, y) = g(x, y / 2) + g(x, y / 2 + 1);
 
-        // The automatic storage folding optimization can't figure
-        // this out due to the downsampling. Explicitly fold it.
-        g.compute_at(f, x).store_root().fold_storage(y, 2);
+        g.compute_at(f, x).store_root();
 
         f.set_custom_allocator(my_malloc, my_free);
 
         Buffer<int> im = f.realize({1000, 1000});
 
         // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 1000 * 2 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 1000 * 2 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -394,12 +410,11 @@ int main(int argc, char **argv) {
 
         size_t expected_size;
         if (interleave) {
-            expected_size = 101 * 3 * 3 * sizeof(int) + sizeof(int);
+            expected_size = 101 * 3 * 3 * sizeof(int);
         } else {
-            expected_size = 101 * 3 * sizeof(int) + sizeof(int);
+            expected_size = 101 * 3 * sizeof(int);
         }
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }

From 50a91e1c753b0643e986a6d0d4c46e01c1ed52ab Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Wed, 3 Mar 2021 13:21:01 -0700
Subject: [PATCH 04/13] Remove old comments.

---
 test/correctness/storage_folding.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp
index 50533c22768f..3bbfb672f512 100644
--- a/test/correctness/storage_folding.cpp
+++ b/test/correctness/storage_folding.cpp
@@ -268,7 +268,6 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
         size_t expected_size = 2 * 1000 * 4 * sizeof(int);
         if (!check_expected_mallocs({expected_size})) {
             return -1;
@@ -305,7 +304,6 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
         size_t expected_size = 1000 * 8 * sizeof(int);
         if (!check_expected_mallocs({expected_size})) {
             return -1;
@@ -341,7 +339,6 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
         size_t expected_size = 2 * 1000 * 3 * sizeof(int);
         if (!check_expected_mallocs({expected_size})) {
             return -1;
@@ -372,7 +369,6 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
         size_t expected_size = 1000 * 2 * sizeof(int);
         if (!check_expected_mallocs({expected_size})) {
             return -1;

From 8c42f5148e4c4ea5107e5d20522ee608ecb8711e Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Thu, 4 Mar 2021 20:18:03 -0700
Subject: [PATCH 05/13] Improve ARM alignment and vldN code generation.

---
 src/CodeGen_ARM.cpp              | 47 ++++++++++++++++++++++----------
 test/correctness/simd_op_check.h | 10 +++----
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index df859ddcf484..c3ec666865c2 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1047,29 +1047,46 @@ void CodeGen_ARM::visit(const Load *op) {
     // aligned to the stride. This makes adjacent strided loads
     // shared a vldN op.
     ModulusRemainder alignment = op->alignment;
+    alignment.remainder = mod_imp(alignment.remainder, alignment.modulus);
     if (stride && stride->value >= 2 && stride->value <= 4) {
-        // For internal buffers, we know this is safe because we
-        // allocate an extra 3 values of padding. For external
-        // buffers, we can only do this if we know the alignment
-        // makes it safe.
+        Expr base = ramp->base;
+        int aligned_stride = gcd(stride->value, alignment.modulus);
+        int offset = 0;
+        if (aligned_stride == stride->value) {
+            offset = mod_imp((int)alignment.remainder, aligned_stride);
+        } else {
+            const Add *add = ramp->base.as<Add>();
+            if (const IntImm *add_c = add ? add->b.as<IntImm>() : ramp->base.as<IntImm>()) {
+                offset = mod_imp(add_c->value, stride->value);
+            }
+        }
+
+        if (offset) {
+            base = simplify(base - offset);
+            alignment.remainder = offset;
+        }
+
+        // We want to load a few more bytes than the original load did.
+        // This is safe under these conditions:
+        // - The alignment information is sufficient to determine the
+        //   modified load is safe.
+        // - For internal buffers, we know this is safe because we
+        //   allocate an extra 3 values of padding.
         // (In ASAN mode, don't read beyond the end of internal buffers either,
         // as ASAN will complain even about harmless stack overreads.)
+        // The min moves lower by offset.
+        bool alignment_safe = alignment.remainder >= 0;
+        // The max moves higher by stride->value - 1 - offset
+        alignment_safe &=
+            alignment.remainder + stride->value - 1 - offset <= alignment.modulus;
         bool external = op->param.defined() || op->image.defined();
-        if (alignment.modulus % stride->value != 0 &&
-            (external || target.has_feature(Target::ASAN))) {
+        if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
             CodeGen_Posix::visit(op);
             return;
         }
 
-        Expr base = ramp->base;
-        int offset = mod_imp(alignment.remainder, stride->value);
-        if (offset) {
-            base = simplify(base - offset);
-            alignment.remainder -= offset;
-        }
-
-        int align_bytes =
-            gcd(alignment.modulus, alignment.remainder) * op->type.bytes();
+        alignment.remainder = mod_imp(alignment.remainder, alignment.modulus);
+        int align_bytes = gcd(alignment.modulus, alignment.remainder) * op->type.bytes();
         // Maximum stack alignment on arm is 16 bytes, so we should
         // never claim alignment greater than that.
         align_bytes = gcd(align_bytes, 16);
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
index b8e3f6636134..43786143aecd 100644
--- a/test/correctness/simd_op_check.h
+++ b/test/correctness/simd_op_check.h
@@ -52,11 +52,6 @@ class SimdOpCheckTest {
                      .with_feature(Target::NoRuntime)
                      .with_feature(Target::DisableLLVMLoopOpt);
         num_threads = Internal::ThreadPool<void>::num_processors_online();
-
-        // Some tests need to know the alignment of the buffer.
-        for (ImageParam i : image_params) {
-            i.dim(0).set_min(0);
-        }
     }
     virtual ~SimdOpCheckTest() = default;
     size_t get_num_threads() const {
@@ -307,6 +302,11 @@ class SimdOpCheckTest {
     virtual void setup_images() {
         for (auto p : image_params) {
             p.reset();
+
+            const int alignment_bytes = 16;
+            p.set_host_alignment(alignment_bytes);
+            const int alignment = alignment_bytes / p.type().bytes();
+            p.dim(0).set_min((p.dim(0).min() / alignment) * alignment);
         }
     }
     virtual bool test_all() {

From 927583b69cd42e000a2c07bec9f37f0460c4b983 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Thu, 4 Mar 2021 22:28:06 -0700
Subject: [PATCH 06/13] Remove merge straggler

---
 src/CodeGen_ARM.h | 74 -----------------------------------------------
 1 file changed, 74 deletions(-)
 delete mode 100644 src/CodeGen_ARM.h

diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h
deleted file mode 100644
index 49d31b9e1662..000000000000
--- a/src/CodeGen_ARM.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef HALIDE_CODEGEN_ARM_H
-#define HALIDE_CODEGEN_ARM_H
-
-/** \file
- * Defines the code-generator for producing ARM machine code
- */
-
-#include <utility>
-
-#include "CodeGen_Posix.h"
-
-namespace Halide {
-
-struct Target;
-
-namespace Internal {
-
-/** A code generator that emits ARM code from a given Halide stmt. */
-class CodeGen_ARM : public CodeGen_Posix {
-public:
-    /** Create an ARM code generator for the given arm target. */
-    CodeGen_ARM(const Target &);
-
-protected:
-    using CodeGen_Posix::visit;
-
-    /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that
-     * takes one vector argument and splits it into two to call inner. */
-    llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const std::string &name);
-    void init_module() override;
-
-    /** Nodes for which we want to emit specific neon intrinsics */
-    // @{
-    void visit(const Cast *) override;
-    void visit(const Sub *) override;
-    void visit(const Mul *) override;
-    void visit(const Min *) override;
-    void visit(const Max *) override;
-    void visit(const Store *) override;
-    void visit(const Load *) override;
-    void visit(const Call *) override;
-    void visit(const LT *) override;
-    void visit(const LE *) override;
-    void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
-    // @}
-
-    int allocation_padding(Type type) const override;
-
-    /** Various patterns to peephole match against */
-    struct Pattern {
-        std::string intrin;  ///< Name of the intrinsic
-        Expr pattern;        ///< The pattern to match against
-        Pattern() = default;
-        Pattern(const std::string &intrin, Expr p)
-            : intrin(intrin), pattern(std::move(p)) {
-        }
-    };
-    std::vector<Pattern> casts, averagings, negations;
-
-    std::string mcpu() const override;
-    std::string mattrs() const override;
-    bool use_soft_float_abi() const override;
-    int native_vector_bits() const override;
-
-    // NEON can be disabled for older processors.
-    bool neon_intrinsics_disabled() {
-        return target.has_feature(Target::NoNEON);
-    }
-};
-
-}  // namespace Internal
-}  // namespace Halide
-
-#endif

From 3099b2db514c90ede69daee4d627012d58ccfa1b Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Thu, 4 Mar 2021 22:52:18 -0700
Subject: [PATCH 07/13] Fix alignment condition (again).

---
 src/CodeGen_ARM.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index c3ec666865c2..aca91f468ca3 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1076,9 +1076,10 @@ void CodeGen_ARM::visit(const Load *op) {
         // as ASAN will complain even about harmless stack overreads.)
         // The min moves lower by offset.
         bool alignment_safe = alignment.remainder >= 0;
-        // The max moves higher by stride->value - 1 - offset
-        alignment_safe &=
-            alignment.remainder + stride->value - 1 - offset <= alignment.modulus;
+        // The max needs to not cross an alignment boundary.
+        int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value;
+        int new_max = old_max + stride->value - 1 - offset;
+        alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus);
         bool external = op->param.defined() || op->image.defined();
         if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
             CodeGen_Posix::visit(op);

From 40d18c8b734abb648315382660bbc069a88df17b Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 5 Mar 2021 01:40:45 -0700
Subject: [PATCH 08/13] Fix alignment.

---
 src/CodeGen_ARM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index aca91f468ca3..aa78efee7254 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1063,7 +1063,7 @@ void CodeGen_ARM::visit(const Load *op) {
 
         if (offset) {
             base = simplify(base - offset);
-            alignment.remainder = offset;
+            alignment.remainder -= offset;
         }
 
         // We want to load a few more bytes than the original load did.

From 7a911fb0b9fe5c317a56b47a2b7a319829ea5598 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 5 Mar 2021 11:21:30 -0700
Subject: [PATCH 09/13] Avoid divide by zero.

---
 src/CodeGen_ARM.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index aa78efee7254..a77612aeb5e8 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1077,9 +1077,11 @@ void CodeGen_ARM::visit(const Load *op) {
         // The min moves lower by offset.
         bool alignment_safe = alignment.remainder >= 0;
         // The max needs to not cross an alignment boundary.
-        int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value;
-        int new_max = old_max + stride->value - 1 - offset;
-        alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus);
+        if (alignment.modulus > 0) {
+            int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value;
+            int new_max = old_max + stride->value - 1 - offset;
+            alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus);
+        }
         bool external = op->param.defined() || op->image.defined();
         if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
             CodeGen_Posix::visit(op);

From 1786c5f72099bbd1e2df3f349a8e4cd5d5191d18 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 8 Mar 2021 18:06:31 -0700
Subject: [PATCH 10/13] Move CodeGen_ARM's logic for strided loads to
 CodeGen_LLVM.

---
 src/CodeGen_ARM.cpp   | 111 +---------------------------
 src/CodeGen_LLVM.cpp  | 165 ++++++++++++++++++++++++------------------
 src/CodeGen_LLVM.h    |   5 +-
 src/CodeGen_Posix.cpp |   4 +-
 4 files changed, 101 insertions(+), 184 deletions(-)

diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index a77612aeb5e8..8054307cf271 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -62,8 +62,6 @@ class CodeGen_ARM : public CodeGen_Posix {
     void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
     // @}
 
-    int allocation_padding(Type type) const override;
-
     /** Various patterns to peephole match against */
     struct Pattern {
         string intrin;  ///< Name of the intrinsic
@@ -1009,12 +1007,6 @@ void CodeGen_ARM::visit(const Store *op) {
     CodeGen_Posix::visit(op);
 }
 
-int CodeGen_ARM::allocation_padding(Type type) const {
-    // We want to be able to load up to 3 values past the end of the buffer in
-    // the Load visitor.
-    return std::max(CodeGen_Posix::allocation_padding(type), type.bytes() * 3);
-}
-
 void CodeGen_ARM::visit(const Load *op) {
     // Predicated load
     if (!is_const_one(op->predicate)) {
@@ -1035,112 +1027,13 @@ void CodeGen_ARM::visit(const Load *op) {
         return;
     }
 
+    // If the stride is in [-1, 4], we can deal with that using vanilla codegen
     const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
-
-    // If the stride is one or minus one, we can deal with that using vanilla codegen
-    if (stride && (stride->value == 1 || stride->value == -1)) {
+    if (stride && (-1 <= stride->value && stride->value <= 4)) {
         CodeGen_Posix::visit(op);
         return;
     }
 
-    // Try to rewrite strided loads as shuffles of dense loads,
-    // aligned to the stride. This makes adjacent strided loads
-    // shared a vldN op.
-    ModulusRemainder alignment = op->alignment;
-    alignment.remainder = mod_imp(alignment.remainder, alignment.modulus);
-    if (stride && stride->value >= 2 && stride->value <= 4) {
-        Expr base = ramp->base;
-        int aligned_stride = gcd(stride->value, alignment.modulus);
-        int offset = 0;
-        if (aligned_stride == stride->value) {
-            offset = mod_imp((int)alignment.remainder, aligned_stride);
-        } else {
-            const Add *add = ramp->base.as<Add>();
-            if (const IntImm *add_c = add ? add->b.as<IntImm>() : ramp->base.as<IntImm>()) {
-                offset = mod_imp(add_c->value, stride->value);
-            }
-        }
-
-        if (offset) {
-            base = simplify(base - offset);
-            alignment.remainder -= offset;
-        }
-
-        // We want to load a few more bytes than the original load did.
-        // This is safe under these conditions:
-        // - The alignment information is sufficient to determine the
-        //   modified load is safe.
-        // - For internal buffers, we know this is safe because we
-        //   allocate an extra 3 values of padding.
-        // (In ASAN mode, don't read beyond the end of internal buffers either,
-        // as ASAN will complain even about harmless stack overreads.)
-        // The min moves lower by offset.
-        bool alignment_safe = alignment.remainder >= 0;
-        // The max needs to not cross an alignment boundary.
-        if (alignment.modulus > 0) {
-            int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value;
-            int new_max = old_max + stride->value - 1 - offset;
-            alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus);
-        }
-        bool external = op->param.defined() || op->image.defined();
-        if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
-            CodeGen_Posix::visit(op);
-            return;
-        }
-
-        alignment.remainder = mod_imp(alignment.remainder, alignment.modulus);
-        int align_bytes = gcd(alignment.modulus, alignment.remainder) * op->type.bytes();
-        // Maximum stack alignment on arm is 16 bytes, so we should
-        // never claim alignment greater than that.
-        align_bytes = gcd(align_bytes, 16);
-        internal_assert(align_bytes > 0);
-
-        // Decide what width to slice things into. If not a multiple
-        // of 64 or 128 bits, then we can't safely slice it up into
-        // some number of vlds, so we hand it over the base class.
-        int bit_width = op->type.bits() * op->type.lanes();
-        int intrin_lanes = 0;
-        if (bit_width % 128 == 0) {
-            intrin_lanes = 128 / op->type.bits();
-        } else if (bit_width % 64 == 0) {
-            intrin_lanes = 64 / op->type.bits();
-        } else {
-            CodeGen_Posix::visit(op);
-            return;
-        }
-
-        llvm::Type *load_return_type = llvm_type_of(op->type.with_lanes(intrin_lanes * stride->value));
-        llvm::Type *load_return_pointer_type = load_return_type->getPointerTo();
-        Value *undef = UndefValue::get(load_return_type);
-        SmallVector<Constant *, 256> constants;
-        for (int j = 0; j < intrin_lanes; j++) {
-            Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset);
-            constants.push_back(constant);
-        }
-        Constant *constantsV = ConstantVector::get(constants);
-
-        vector<Value *> results;
-        for (int i = 0; i < op->type.lanes(); i += intrin_lanes) {
-            Expr slice_base = simplify(base + i * ramp->stride);
-            Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_lanes);
-            Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base);
-            Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type);
-            LoadInst *loadI = cast<LoadInst>(builder->CreateLoad(bitcastI));
-#if LLVM_VERSION >= 110
-            loadI->setAlignment(Align(align_bytes));
-#else
-            loadI->setAlignment(MaybeAlign(align_bytes));
-#endif
-            add_tbaa_metadata(loadI, op->name, slice_ramp);
-            Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV);
-            results.push_back(shuffleInstr);
-        }
-
-        // Concat the results
-        value = concat_vectors(results);
-        return;
-    }
-
     // We have builtins for strided loads with fixed but unknown stride, but they use inline assembly.
     if (target.bits != 64 /* Not yet implemented for aarch64 */) {
         ostringstream builtin;
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index eda89aca82da..5d82c3013f7b 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1910,63 +1910,76 @@ void CodeGen_LLVM::visit(const Load *op) {
 
         if (ramp && stride && stride->value == 1) {
             value = codegen_dense_vector_load(op);
-        } else if (ramp && stride && stride->value == 2) {
-            // Load two vectors worth and then shuffle
-            Expr base_a = ramp->base, base_b = ramp->base + ramp->lanes;
-            Expr stride_a = make_one(base_a.type());
-            Expr stride_b = make_one(base_b.type());
-
-            ModulusRemainder align_a = op->alignment;
-            ModulusRemainder align_b = align_a + ramp->lanes;
-
-            // False indicates we should take the even-numbered lanes
-            // from the load, true indicates we should take the
-            // odd-numbered-lanes.
-            bool shifted_a = false, shifted_b = false;
+        } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) {
+            // Try to rewrite strided loads as shuffles of dense loads,
+            // aligned to the stride. This makes adjacent strided loads
+            // shared a vldN op.
+            ModulusRemainder align = op->alignment;
+            Expr base = ramp->base;
+            int aligned_stride = gcd(stride->value, align.modulus);
+            int offset = 0;
+            if (aligned_stride == stride->value) {
+                offset = mod_imp((int)align.remainder, aligned_stride);
+            } else {
+                const Add *add = base.as<Add>();
+                if (const IntImm *add_c = add ? add->b.as<IntImm>() : base.as<IntImm>()) {
+                    offset = mod_imp(add_c->value, stride->value);
+                }
+            }
 
-            bool external = op->param.defined() || op->image.defined();
+            if (offset) {
+                base = simplify(base - offset);
+                align.remainder -= offset;
+            }
 
-            // Don't read beyond the end of an external buffer.
+            // We want to load a few more bytes than the original load did.
+            // This is safe under these conditions:
+            // - The alignment information is sufficient to determine the
+            //   modified load is safe.
+            // - For internal buffers, we know this is safe because we
+            //   allocate an extra 3 values of padding.
             // (In ASAN mode, don't read beyond the end of internal buffers either,
             // as ASAN will complain even about harmless stack overreads.)
-            if (external || target.has_feature(Target::ASAN)) {
-                base_b -= 1;
-                align_b = align_b - 1;
-                shifted_b = true;
-            } else {
-                // If the base ends in an odd constant, then subtract one
-                // and do a different shuffle. This helps expressions like
-                // (f(2*x) + f(2*x+1) share loads
-                const Add *add = ramp->base.as<Add>();
-                const IntImm *offset = add ? add->b.as<IntImm>() : ramp->base.as<IntImm>();
-                if (offset && offset->value & 1) {
-                    base_a -= 1;
-                    align_a = align_a - 1;
-                    shifted_a = true;
-                    base_b -= 1;
-                    align_b = align_b - 1;
-                    shifted_b = true;
-                }
+            // The min moves lower by offset.
+            int load_lanes = ramp->lanes * stride->value;
+            bool alignment_safe = align.remainder >= 0;
+            // The max needs to not cross an alignment boundary.
+            if (align.modulus > 0) {
+                int old_max = align.remainder + (op->type.lanes() - 1) * stride->value;
+                int new_max = old_max + stride->value - 1 - offset;
+                alignment_safe &= (old_max / align.modulus == new_max / align.modulus);
+            }
+            bool external = op->param.defined() || op->image.defined();
+            if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
+                load_lanes -= (stride->value - 1 - offset);
             }
 
-            // Do each load.
-            Expr ramp_a = Ramp::make(base_a, stride_a, ramp->lanes);
-            Expr ramp_b = Ramp::make(base_b, stride_b, ramp->lanes);
-            Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param, op->predicate, align_a);
-            Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param, op->predicate, align_b);
-            Value *vec_a = codegen(load_a);
-            Value *vec_b = codegen(load_b);
+            int slice_lanes = native_vector_bits() / op->type.bits();
 
-            // Shuffle together the results.
-            vector<int> indices(ramp->lanes);
-            for (int i = 0; i < (ramp->lanes + 1) / 2; i++) {
-                indices[i] = i * 2 + (shifted_a ? 1 : 0);
-            }
-            for (int i = (ramp->lanes + 1) / 2; i < ramp->lanes; i++) {
-                indices[i] = i * 2 + (shifted_b ? 1 : 0);
+            // We need to slice the result in to native vector lanes, otherwise
+            // LLVM misses optimizations like using ldN on ARM.
+            vector<Value *> results;
+            for (int i = 0; i < op->type.lanes(); i += slice_lanes) {
+                int load_lanes_i = std::min<int>(slice_lanes * stride->value, load_lanes - i);
+                int lanes_i = std::min<int>(slice_lanes, op->type.lanes() - i);
+                Expr slice_base = simplify(base + i * ramp->stride);
+
+                Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base,
+                                                         op->image, op->param, op->alignment, nullptr, false);
+
+                SmallVector<Constant *, 256> constants;
+                for (int j = 0; j < lanes_i; j++) {
+                    Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset);
+                    constants.push_back(constant);
+                }
+                Constant *constantsV = ConstantVector::get(constants);
+                Value *undef = UndefValue::get(load_i->getType());
+                Value *shuffleInstr = builder->CreateShuffleVector(load_i, undef, constantsV);
+                results.push_back(shuffleInstr);
             }
 
-            value = shuffle_vectors(vec_a, vec_b, indices);
+            // Concat the results
+            value = concat_vectors(results);
         } else if (ramp && stride && stride->value == -1) {
             // Load the vector and then flip it in-place
             Expr flipped_base = ramp->base - ramp->lanes + 1;
@@ -2249,14 +2262,14 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) {
     }
 }
 
-Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
-    debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n";
-
-    const Ramp *ramp = load->index.as<Ramp>();
-    internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
+llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
+                                                     const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
+                                                     llvm::Value *vpred, bool slice_to_native) {
+    debug(4) << "Vectorize predicated dense vector load:\n\t"
+             << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n";
 
-    bool is_external = (external_buffer.find(load->name) != external_buffer.end());
-    int alignment = load->type.bytes();  // The size of a single element
+    bool is_external = (external_buffer.find(name) != external_buffer.end());
+    int align_bytes = type.bytes();  // The size of a single element
 
     int native_bits = native_vector_bits();
     int native_bytes = native_bits / 8;
@@ -2266,60 +2279,68 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
     // maximum alignment we can infer based on the index alone.
 
     // Boost the alignment if possible, up to the native vector width.
-    ModulusRemainder mod_rem = load->alignment;
+    ModulusRemainder mod_rem = alignment;
     while ((mod_rem.remainder & 1) == 0 &&
            (mod_rem.modulus & 1) == 0 &&
-           alignment < native_bytes) {
+           align_bytes < native_bytes) {
         mod_rem.modulus /= 2;
         mod_rem.remainder /= 2;
-        alignment *= 2;
+        align_bytes *= 2;
     }
 
     // If it is an external buffer, then we cannot assume that the host pointer
     // is aligned to at least native vector width. However, we may be able to do
     // better than just assuming that it is unaligned.
     if (is_external) {
-        if (load->param.defined()) {
-            int host_alignment = load->param.host_alignment();
-            alignment = gcd(alignment, host_alignment);
-        } else if (get_target().has_feature(Target::JIT) && load->image.defined()) {
+        if (param.defined()) {
+            int host_alignment = param.host_alignment();
+            align_bytes = gcd(align_bytes, host_alignment);
+        } else if (get_target().has_feature(Target::JIT) && image.defined()) {
             // If we're JITting, use the actual pointer value to determine alignment for embedded buffers.
-            alignment = gcd(alignment, (int)(((uintptr_t)load->image.data()) & std::numeric_limits<int>::max()));
+            align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits<int>::max()));
         }
     }
 
     // For dense vector loads wider than the native vector
     // width, bust them up into native vectors
-    int load_lanes = load->type.lanes();
-    int native_lanes = std::max(1, native_bits / load->type.bits());
+    int load_lanes = type.lanes();
+    int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes;
     vector<Value *> slices;
     for (int i = 0; i < load_lanes; i += native_lanes) {
         int slice_lanes = std::min(native_lanes, load_lanes - i);
-        Expr slice_base = simplify(ramp->base + i);
+        Expr slice_base = simplify(base + i);
         Expr slice_stride = make_one(slice_base.type());
         Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes);
-        llvm::Type *slice_type = get_vector_type(llvm_type_of(load->type.element_of()), slice_lanes);
-        Value *elt_ptr = codegen_buffer_pointer(load->name, load->type.element_of(), slice_base);
+        llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes);
+        Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base);
         Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo());
 
         Instruction *load_inst;
         if (vpred != nullptr) {
             Value *slice_mask = slice_vector(vpred, i, slice_lanes);
 #if LLVM_VERSION >= 110
-            load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(alignment), slice_mask);
+            load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(align_bytes), slice_mask);
 #else
-            load_inst = builder->CreateMaskedLoad(vec_ptr, alignment, slice_mask);
+            load_inst = builder->CreateMaskedLoad(vec_ptr, align_bytes, slice_mask);
 #endif
         } else {
-            load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(alignment));
+            load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(align_bytes));
         }
-        add_tbaa_metadata(load_inst, load->name, slice_index);
+        add_tbaa_metadata(load_inst, name, slice_index);
         slices.push_back(load_inst);
     }
     value = concat_vectors(slices);
     return value;
 }
 
+Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) {
+    const Ramp *ramp = load->index.as<Ramp>();
+    internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
+
+    return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param,
+                                     load->alignment, vpred, slice_to_native);
+}
+
 void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) {
     const Ramp *ramp = op->index.as<Ramp>();
     const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index 092bc7713b5b..06ec634938ec 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -552,7 +552,10 @@ class CodeGen_LLVM : public IRVisitor {
 
     llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false);
 
-    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr);
+    llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
+                                           const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
+                                           llvm::Value *vpred = nullptr, bool slice_to_native = true);
+    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true);
 
     virtual void codegen_predicated_vector_load(const Load *op);
     virtual void codegen_predicated_vector_store(const Store *op);
diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp
index 32b3202477aa..49a627b767b1 100644
--- a/src/CodeGen_Posix.cpp
+++ b/src/CodeGen_Posix.cpp
@@ -75,10 +75,10 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type
 }
 
 int CodeGen_Posix::allocation_padding(Type type) const {
-    // We potentially load one scalar value past the end of the
+    // We potentially load 3 scalar values past the end of the
     // buffer, so pad the allocation with an extra instance of the
     // scalar type.
-    return type.bytes();
+    return 3 * type.bytes();
 }
 
 CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type,

From 945ae21bffae8abab34117326f2e117254299969 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 8 Mar 2021 18:28:32 -0700
Subject: [PATCH 11/13] Fix comment.

---
 src/CodeGen_LLVM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 5d82c3013f7b..f5aeb156b674 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1913,7 +1913,7 @@ void CodeGen_LLVM::visit(const Load *op) {
         } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) {
             // Try to rewrite strided loads as shuffles of dense loads,
             // aligned to the stride. This makes adjacent strided loads
-            // shared a vldN op.
+            // share the same underlying dense loads.
             ModulusRemainder align = op->alignment;
             Expr base = ramp->base;
             int aligned_stride = gcd(stride->value, align.modulus);

From ecb8daa40d3d49312e52a1d97ae674a8d7139b18 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 8 Mar 2021 18:53:26 -0700
Subject: [PATCH 12/13] clang-format.

---
 src/CodeGen_LLVM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index f5aeb156b674..749ad56e3e73 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1965,7 +1965,7 @@ void CodeGen_LLVM::visit(const Load *op) {
                 Expr slice_base = simplify(base + i * ramp->stride);
 
                 Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base,
-                                                         op->image, op->param, op->alignment, nullptr, false);
+                                                          op->image, op->param, op->alignment, nullptr, false);
 
                 SmallVector<Constant *, 256> constants;
                 for (int j = 0; j < lanes_i; j++) {

From ace9fb3bc81bc6370fd8e9aad5dadda450534cf5 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Tue, 9 Mar 2021 12:20:02 -0700
Subject: [PATCH 13/13] Remove sketchy alignment check.

---
 src/CodeGen_LLVM.cpp               | 16 ++---------
 test/correctness/simd_op_check.cpp | 45 ++++++++++++++++--------------
 2 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 749ad56e3e73..f8fdf854e6b2 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1933,24 +1933,14 @@ void CodeGen_LLVM::visit(const Load *op) {
             }
 
             // We want to load a few more bytes than the original load did.
-            // This is safe under these conditions:
-            // - The alignment information is sufficient to determine the
-            //   modified load is safe.
-            // - For internal buffers, we know this is safe because we
-            //   allocate an extra 3 values of padding.
+            // We know this is safe for internal buffers because we allocate
+            // padding.
             // (In ASAN mode, don't read beyond the end of internal buffers either,
             // as ASAN will complain even about harmless stack overreads.)
             // The min moves lower by offset.
             int load_lanes = ramp->lanes * stride->value;
-            bool alignment_safe = align.remainder >= 0;
-            // The max needs to not cross an alignment boundary.
-            if (align.modulus > 0) {
-                int old_max = align.remainder + (op->type.lanes() - 1) * stride->value;
-                int new_max = old_max + stride->value - 1 - offset;
-                alignment_safe &= (old_max / align.modulus == new_max / align.modulus);
-            }
             bool external = op->param.defined() || op->image.defined();
-            if (!alignment_safe && (external || target.has_feature(Target::ASAN))) {
+            if (external || target.has_feature(Target::ASAN)) {
                 load_lanes -= (stride->value - 1 - offset);
             }
 
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index d2bebe1899a7..a67af7602a18 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -802,31 +802,34 @@ class SimdOpCheck : public SimdOpCheckTest {
             }
 
             // VLD2     X       -       Load Two-Element Structures
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_i32(x * 2) + in_i32(x * 2 + 1));
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_u32(x * 2) + in_u32(x * 2 + 1));
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_f32(x * 2) + in_f32(x * 2 + 1));
-            check(arm32 ? "vld2.8" : "ld2", 8 * w, in_i8(x * 2) + in_i8(x * 2 + 1));
-            check(arm32 ? "vld2.8" : "ld2", 8 * w, in_u8(x * 2) + in_u8(x * 2 + 1));
-            check(arm32 ? "vld2.16" : "ld2", 4 * w, in_i16(x * 2) + in_i16(x * 2 + 1));
-            check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1));
+            // These need to be vectorized at least 2 native vectors wide,
+            // so we get a full vectors' worth that we know is safe to
+            // access.
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1));
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1));
 
             // VLD3     X       -       Load Three-Element Structures
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3));
 
             // VLD4     X       -       Load Four-Element Structures
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4));
 
             // VLDM     X       F, D    Load Multiple Registers
             // VLDR     X       F, D    Load Single Register