diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index e43097b95180..8054307cf271 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1027,96 +1027,13 @@ void CodeGen_ARM::visit(const Load *op) {
         return;
     }
 
+    // If the stride is in [-1, 4], we can deal with that using vanilla codegen
     const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
-
-    // If the stride is one or minus one, we can deal with that using vanilla codegen
-    if (stride && (stride->value == 1 || stride->value == -1)) {
+    if (stride && (-1 <= stride->value && stride->value <= 4)) {
         CodeGen_Posix::visit(op);
         return;
     }
 
-    // Strided loads with known stride
-    if (stride && stride->value >= 2 && stride->value <= 4) {
-        // Check alignment on the base. Attempt to shift to an earlier
-        // address if it simplifies the expression. This makes
-        // adjacent strided loads shared a vldN op.
-        Expr base = ramp->base;
-        int offset = 0;
-        ModulusRemainder mod_rem = modulus_remainder(ramp->base);
-
-        const Add *add = base.as<Add>();
-        const IntImm *add_b = add ? add->b.as<IntImm>() : nullptr;
-
-        if ((mod_rem.modulus % stride->value) == 0) {
-            offset = mod_rem.remainder % stride->value;
-        } else if ((mod_rem.modulus == 1) && add_b) {
-            offset = add_b->value % stride->value;
-            if (offset < 0) {
-                offset += stride->value;
-            }
-        }
-
-        if (offset) {
-            base = simplify(base - offset);
-            mod_rem.remainder -= offset;
-            if (mod_rem.modulus) {
-                mod_rem.remainder = mod_imp(mod_rem.remainder, mod_rem.modulus);
-            }
-        }
-
-        int alignment = op->type.bytes();
-        alignment *= gcd(mod_rem.modulus, mod_rem.remainder);
-        // Maximum stack alignment on arm is 16 bytes, so we should
-        // never claim alignment greater than that.
-        alignment = gcd(alignment, 16);
-        internal_assert(alignment > 0);
-
-        // Decide what width to slice things into. If not a multiple
-        // of 64 or 128 bits, then we can't safely slice it up into
-        // some number of vlds, so we hand it over the base class.
-        int bit_width = op->type.bits() * op->type.lanes();
-        int intrin_lanes = 0;
-        if (bit_width % 128 == 0) {
-            intrin_lanes = 128 / op->type.bits();
-        } else if (bit_width % 64 == 0) {
-            intrin_lanes = 64 / op->type.bits();
-        } else {
-            CodeGen_Posix::visit(op);
-            return;
-        }
-
-        llvm::Type *load_return_type = llvm_type_of(op->type.with_lanes(intrin_lanes * stride->value));
-        llvm::Type *load_return_pointer_type = load_return_type->getPointerTo();
-        Value *undef = UndefValue::get(load_return_type);
-        SmallVector<Constant *, 256> constants;
-        for (int j = 0; j < intrin_lanes; j++) {
-            Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset);
-            constants.push_back(constant);
-        }
-        Constant *constantsV = ConstantVector::get(constants);
-
-        vector<Value *> results;
-        for (int i = 0; i < op->type.lanes(); i += intrin_lanes) {
-            Expr slice_base = simplify(base + i * ramp->stride);
-            Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_lanes);
-            Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base);
-            Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type);
-            LoadInst *loadI = cast<LoadInst>(builder->CreateLoad(bitcastI));
-#if LLVM_VERSION >= 110
-            loadI->setAlignment(Align(alignment));
-#else
-            loadI->setAlignment(MaybeAlign(alignment));
-#endif
-            add_tbaa_metadata(loadI, op->name, slice_ramp);
-            Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV);
-            results.push_back(shuffleInstr);
-        }
-
-        // Concat the results
-        value = concat_vectors(results);
-        return;
-    }
-
     // We have builtins for strided loads with fixed but unknown stride, but they use inline assembly.
     if (target.bits != 64 /* Not yet implemented for aarch64 */) {
         ostringstream builtin;
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index eda89aca82da..f8fdf854e6b2 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1910,63 +1910,66 @@ void CodeGen_LLVM::visit(const Load *op) {
 
         if (ramp && stride && stride->value == 1) {
             value = codegen_dense_vector_load(op);
-        } else if (ramp && stride && stride->value == 2) {
-            // Load two vectors worth and then shuffle
-            Expr base_a = ramp->base, base_b = ramp->base + ramp->lanes;
-            Expr stride_a = make_one(base_a.type());
-            Expr stride_b = make_one(base_b.type());
-
-            ModulusRemainder align_a = op->alignment;
-            ModulusRemainder align_b = align_a + ramp->lanes;
-
-            // False indicates we should take the even-numbered lanes
-            // from the load, true indicates we should take the
-            // odd-numbered-lanes.
-            bool shifted_a = false, shifted_b = false;
+        } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) {
+            // Try to rewrite strided loads as shuffles of dense loads,
+            // aligned to the stride. This makes adjacent strided loads
+            // share the same underlying dense loads.
+            ModulusRemainder align = op->alignment;
+            Expr base = ramp->base;
+            int aligned_stride = gcd(stride->value, align.modulus);
+            int offset = 0;
+            if (aligned_stride == stride->value) {
+                offset = mod_imp((int)align.remainder, aligned_stride);
+            } else {
+                const Add *add = base.as<Add>();
+                if (const IntImm *add_c = add ? add->b.as<IntImm>() : base.as<IntImm>()) {
+                    offset = mod_imp(add_c->value, stride->value);
+                }
+            }
 
-            bool external = op->param.defined() || op->image.defined();
+            if (offset) {
+                base = simplify(base - offset);
+                align.remainder -= offset;
+            }
 
-            // Don't read beyond the end of an external buffer.
+            // We want to load a few more bytes than the original load did.
+            // We know this is safe for internal buffers because we allocate
+            // padding.
             // (In ASAN mode, don't read beyond the end of internal buffers either,
             // as ASAN will complain even about harmless stack overreads.)
+            // The min moves lower by offset.
+            int load_lanes = ramp->lanes * stride->value;
+            bool external = op->param.defined() || op->image.defined();
             if (external || target.has_feature(Target::ASAN)) {
-                base_b -= 1;
-                align_b = align_b - 1;
-                shifted_b = true;
-            } else {
-                // If the base ends in an odd constant, then subtract one
-                // and do a different shuffle. This helps expressions like
-                // (f(2*x) + f(2*x+1) share loads
-                const Add *add = ramp->base.as<Add>();
-                const IntImm *offset = add ? add->b.as<IntImm>() : ramp->base.as<IntImm>();
-                if (offset && offset->value & 1) {
-                    base_a -= 1;
-                    align_a = align_a - 1;
-                    shifted_a = true;
-                    base_b -= 1;
-                    align_b = align_b - 1;
-                    shifted_b = true;
-                }
+                load_lanes -= (stride->value - 1 - offset);
             }
 
-            // Do each load.
-            Expr ramp_a = Ramp::make(base_a, stride_a, ramp->lanes);
-            Expr ramp_b = Ramp::make(base_b, stride_b, ramp->lanes);
-            Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param, op->predicate, align_a);
-            Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param, op->predicate, align_b);
-            Value *vec_a = codegen(load_a);
-            Value *vec_b = codegen(load_b);
+            int slice_lanes = native_vector_bits() / op->type.bits();
 
-            // Shuffle together the results.
-            vector<int> indices(ramp->lanes);
-            for (int i = 0; i < (ramp->lanes + 1) / 2; i++) {
-                indices[i] = i * 2 + (shifted_a ? 1 : 0);
-            }
-            for (int i = (ramp->lanes + 1) / 2; i < ramp->lanes; i++) {
-                indices[i] = i * 2 + (shifted_b ? 1 : 0);
+            // Slice the result into native-vector-sized pieces; otherwise LLVM
+            // misses optimizations like using ldN on ARM.
+            vector<Value *> results;
+            for (int i = 0; i < op->type.lanes(); i += slice_lanes) {
+                // Output lane i lives at dense lane i * stride, so the dense offset
+                // (not i) bounds this slice's load and shifts its alignment.
+                int load_base_i = i * (int)stride->value;
+                int load_lanes_i = std::min<int>(slice_lanes * stride->value, load_lanes - load_base_i);
+                int lanes_i = std::min<int>(slice_lanes, op->type.lanes() - i);
+                Expr slice_base = simplify(base + load_base_i);
+                Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base,
+                                                          op->image, op->param, align + load_base_i, nullptr, false);
+
+                SmallVector<Constant *, 256> constants;
+                for (int j = 0; j < lanes_i; j++) {
+                    constants.push_back(ConstantInt::get(i32_t, j * stride->value + offset));
+                }
+                Constant *constantsV = ConstantVector::get(constants);
+                Value *undef = UndefValue::get(load_i->getType());
+                results.push_back(builder->CreateShuffleVector(load_i, undef, constantsV));
+            }
 
-            value = shuffle_vectors(vec_a, vec_b, indices);
+            // Concat the results
+            value = concat_vectors(results);
         } else if (ramp && stride && stride->value == -1) {
             // Load the vector and then flip it in-place
             Expr flipped_base = ramp->base - ramp->lanes + 1;
@@ -2249,14 +2252,14 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) {
     }
 }
 
-Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
-    debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n";
-
-    const Ramp *ramp = load->index.as<Ramp>();
-    internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
+llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
+                                                     const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
+                                                     llvm::Value *vpred, bool slice_to_native) {
+    debug(4) << "Vectorize predicated dense vector load:\n\t"
+             << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n";
 
-    bool is_external = (external_buffer.find(load->name) != external_buffer.end());
-    int alignment = load->type.bytes();  // The size of a single element
+    bool is_external = (external_buffer.find(name) != external_buffer.end());
+    int align_bytes = type.bytes();  // The size of a single element
 
     int native_bits = native_vector_bits();
     int native_bytes = native_bits / 8;
@@ -2266,60 +2269,68 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
     // maximum alignment we can infer based on the index alone.
 
     // Boost the alignment if possible, up to the native vector width.
-    ModulusRemainder mod_rem = load->alignment;
+    ModulusRemainder mod_rem = alignment;
     while ((mod_rem.remainder & 1) == 0 &&
            (mod_rem.modulus & 1) == 0 &&
-           alignment < native_bytes) {
+           align_bytes < native_bytes) {
         mod_rem.modulus /= 2;
         mod_rem.remainder /= 2;
-        alignment *= 2;
+        align_bytes *= 2;
     }
 
     // If it is an external buffer, then we cannot assume that the host pointer
     // is aligned to at least native vector width. However, we may be able to do
     // better than just assuming that it is unaligned.
     if (is_external) {
-        if (load->param.defined()) {
-            int host_alignment = load->param.host_alignment();
-            alignment = gcd(alignment, host_alignment);
-        } else if (get_target().has_feature(Target::JIT) && load->image.defined()) {
+        if (param.defined()) {
+            int host_alignment = param.host_alignment();
+            align_bytes = gcd(align_bytes, host_alignment);
+        } else if (get_target().has_feature(Target::JIT) && image.defined()) {
             // If we're JITting, use the actual pointer value to determine alignment for embedded buffers.
-            alignment = gcd(alignment, (int)(((uintptr_t)load->image.data()) & std::numeric_limits<int>::max()));
+            align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits<int>::max()));
         }
     }
 
     // For dense vector loads wider than the native vector
     // width, bust them up into native vectors
-    int load_lanes = load->type.lanes();
-    int native_lanes = std::max(1, native_bits / load->type.bits());
+    int load_lanes = type.lanes();
+    int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes;
     vector<Value *> slices;
     for (int i = 0; i < load_lanes; i += native_lanes) {
         int slice_lanes = std::min(native_lanes, load_lanes - i);
-        Expr slice_base = simplify(ramp->base + i);
+        Expr slice_base = simplify(base + i);
         Expr slice_stride = make_one(slice_base.type());
         Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes);
-        llvm::Type *slice_type = get_vector_type(llvm_type_of(load->type.element_of()), slice_lanes);
-        Value *elt_ptr = codegen_buffer_pointer(load->name, load->type.element_of(), slice_base);
+        llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes);
+        Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base);
         Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo());
 
         Instruction *load_inst;
         if (vpred != nullptr) {
             Value *slice_mask = slice_vector(vpred, i, slice_lanes);
 #if LLVM_VERSION >= 110
-            load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(alignment), slice_mask);
+            load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(align_bytes), slice_mask);
 #else
-            load_inst = builder->CreateMaskedLoad(vec_ptr, alignment, slice_mask);
+            load_inst = builder->CreateMaskedLoad(vec_ptr, align_bytes, slice_mask);
 #endif
         } else {
-            load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(alignment));
+            load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(align_bytes));
         }
-        add_tbaa_metadata(load_inst, load->name, slice_index);
+        add_tbaa_metadata(load_inst, name, slice_index);
         slices.push_back(load_inst);
     }
     value = concat_vectors(slices);
     return value;
 }
 
+Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) {
+    const Ramp *ramp = load->index.as<Ramp>();
+    internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
+    // Unpack the Load node and defer to the overload that takes its fields individually.
+    return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param,
+                                     load->alignment, vpred, slice_to_native);
+}
+
 void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) {
     const Ramp *ramp = op->index.as<Ramp>();
     const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index 092bc7713b5b..06ec634938ec 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -552,7 +552,10 @@ class CodeGen_LLVM : public IRVisitor {
 
     llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false);
 
-    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr);
+    llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
+                                           const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
+                                           llvm::Value *vpred = nullptr, bool slice_to_native = true);
+    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true);
 
     virtual void codegen_predicated_vector_load(const Load *op);
     virtual void codegen_predicated_vector_store(const Store *op);
diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp
index 32b3202477aa..49a627b767b1 100644
--- a/src/CodeGen_Posix.cpp
+++ b/src/CodeGen_Posix.cpp
@@ -75,10 +75,10 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type
 }
 
 int CodeGen_Posix::allocation_padding(Type type) const {
-    // We potentially load one scalar value past the end of the
-    // buffer, so pad the allocation with an extra instance of the
-    // scalar type.
+    // We potentially load 3 scalar values past the end of the
+    // buffer, so pad the allocation with 3 extra instances of the
+    // scalar type.
-    return type.bytes();
+    return 3 * type.bytes();
 }
 
 CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type,
diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp
index e3b11503aff2..2f435f975dd3 100644
--- a/test/correctness/nested_tail_strategies.cpp
+++ b/test/correctness/nested_tail_strategies.cpp
@@ -45,14 +45,15 @@ void check(Func out, int line, std::vector<TailStrategy> tails) {
         largest_allocation = 0;
         out.realize({s});
         size_t expected = (s + 1) * 4;
-        if (largest_allocation > expected) {
+        size_t tolerance = 3 * sizeof(int);
+        if (largest_allocation > expected + tolerance) {
             std::cerr << "Failure on line " << line << "\n"
                       << "with tail strategies: ";
             for (auto t : tails) {
                 std::cerr << t << " ";
             }
             std::cerr << "\n allocation of " << largest_allocation
-                      << " bytes is too large. Expected " << expected << "\n";
+                      << " bytes is too large. Expected " << expected + tolerance << "\n";
             abort();
         }
     }
diff --git a/test/correctness/pseudostack_shares_slots.cpp b/test/correctness/pseudostack_shares_slots.cpp
index fd491ba149ea..a9e2f49dd048 100644
--- a/test/correctness/pseudostack_shares_slots.cpp
+++ b/test/correctness/pseudostack_shares_slots.cpp
@@ -2,10 +2,11 @@
 
 using namespace Halide;
 
-std::vector<size_t> mallocs;
+const int tolerance = 3 * sizeof(int);
+std::vector<int> mallocs;
 
 void *my_malloc(void *user_context, size_t x) {
-    mallocs.push_back(x);
+    mallocs.push_back((int)x);
     void *orig = malloc(x + 32);
     void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5);
     ((void **)ptr)[-1] = orig;
@@ -52,9 +53,11 @@ int main(int argc, char **argv) {
             mallocs.clear();
             p.set(sz);
             chain.back().realize({1024});
-            size_t sz1 = sz + 2 * 20 - 1;
-            size_t sz2 = sz1 - 2;
-            if (mallocs.size() != 2 || mallocs[0] != sz1 || mallocs[1] != sz2) {
+            int sz1 = sz + 2 * 20 - 1;
+            int sz2 = sz1 - 2;
+            if (mallocs.size() != 2 ||
+                std::abs(mallocs[0] - sz1) > tolerance ||
+                std::abs(mallocs[1] - sz2) > tolerance) {
                 printf("Incorrect allocations: %d %d %d\n", (int)mallocs.size(), (int)mallocs[0], (int)mallocs[1]);
                 printf("Expected: 2 %d %d\n", (int)sz1, (int)sz2);
                 return -1;
@@ -93,14 +96,18 @@ int main(int argc, char **argv) {
             mallocs.clear();
             p.set(sz);
             chain.back().realize({1024});
-            size_t sz1 = sz / 8 + 23;
-            size_t sz2 = sz1 - 2;
-            size_t sz3 = sz + 19;
-            size_t sz4 = sz3 - 2;
-            if (mallocs.size() != 4 || mallocs[0] != sz1 || mallocs[1] != sz2 || mallocs[2] != sz3 || mallocs[3] != sz4) {
+            int sz1 = sz / 8 + 23;
+            int sz2 = sz1 - 2;
+            int sz3 = sz + 19;
+            int sz4 = sz3 - 2;
+            if (mallocs.size() != 4 ||
+                std::abs(mallocs[0] - sz1) > tolerance ||
+                std::abs(mallocs[1] - sz2) > tolerance ||
+                std::abs(mallocs[2] - sz3) > tolerance ||
+                std::abs(mallocs[3] - sz4) > tolerance) {
                 printf("Incorrect allocations: %d %d %d %d %d\n", (int)mallocs.size(),
-                       (int)mallocs[0], (int)mallocs[1], (int)mallocs[2], (int)mallocs[3]);
-                printf("Expected: 4 %d %d %d %d\n", (int)sz1, (int)sz2, (int)sz3, (int)sz4);
+                       mallocs[0], mallocs[1], mallocs[2], mallocs[3]);
+                printf("Expected: 4 %d %d %d %d\n", sz1, sz2, sz3, sz4);
                 return -1;
             }
         }
diff --git a/test/correctness/reorder_storage.cpp b/test/correctness/reorder_storage.cpp
index 546812bad57a..953cd2b1e821 100644
--- a/test/correctness/reorder_storage.cpp
+++ b/test/correctness/reorder_storage.cpp
@@ -3,11 +3,13 @@
 
 using namespace Halide;
 
-size_t expected_allocation = 0;
+// Backends allocate up to 3 extra elements.
+int tolerance = 3 * sizeof(int);
+int expected_allocation = 0;
 
 void *my_malloc(void *user_context, size_t x) {
-    if (x != expected_allocation) {
-        printf("Error! Expected allocation of %zu bytes, got %zu bytes\n", expected_allocation, x);
+    if (std::abs((int)x - expected_allocation) > tolerance) {
+        printf("Error! Expected allocation of %d bytes, got %zu bytes (tolerance %d)\n", expected_allocation, x, tolerance);
         exit(-1);
     }
     return malloc(x);
@@ -40,11 +42,10 @@ int main(int argc, char **argv) {
     g.set_custom_allocator(my_malloc, my_free);
 
     // Without any storage alignment, we should expect an allocation
-    // that is the product of the extents of the realization (plus one
-    // for the magical extra Halide element).
+    // that is the product of the extents of the realization.
     int W = 10;
     int H = 11;
-    expected_allocation = (3 * W * H + 1) * sizeof(int);
+    expected_allocation = 3 * W * H * sizeof(int);
 
     g.realize({W, H, 3});
 
@@ -53,7 +54,7 @@ int main(int argc, char **argv) {
 
     // We've aligned the x dimension, make sure the allocation reflects this.
     int W_aligned = (W + x_alignment - 1) & ~(x_alignment - 1);
-    expected_allocation = (W_aligned * H * 3 + 1) * sizeof(int);
+    expected_allocation = W_aligned * H * 3 * sizeof(int);
 
     // Force g to clear it's cache...
     g.compute_root();
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index a21bdaa71243..a67af7602a18 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -802,31 +802,34 @@ class SimdOpCheck : public SimdOpCheckTest {
             }
 
             // VLD2     X       -       Load Two-Element Structures
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_i32(x * 2) + in_i32(x * 2 + 1));
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_u32(x * 2) + in_u32(x * 2 + 1));
-            check(arm32 ? "vld2.32" : "ld2", 4 * w, in_f32(x * 2) + in_f32(x * 2 + 1));
-            check(arm32 ? "vld2.8" : "ld2", 8 * w, in_i8(x * 2) + in_i8(x * 2 + 1));
-            check(arm32 ? "vld2.8" : "ld2", 8 * w, in_u8(x * 2) + in_u8(x * 2 + 1));
-            check(arm32 ? "vld2.16" : "ld2", 4 * w, in_i16(x * 2) + in_i16(x * 2 + 1));
-            check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1));
+            // These need to be vectorized at least 2 native vectors wide,
+            // so we get a full vector's worth that we know is safe to
+            // access.
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1));
+            check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1));
+            check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1));
+            check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1));
 
             // VLD3     X       -       Load Three-Element Structures
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3 + y));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3 + y));
-            check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3 + y));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3 + y));
-            check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3 + y));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3 + y));
-            check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3 + y));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3));
+            check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3));
+            check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3));
+            check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3));
 
             // VLD4     X       -       Load Four-Element Structures
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4 + y));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4 + y));
-            check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4 + y));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4 + y));
-            check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4 + y));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4 + y));
-            check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4 + y));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4));
+            check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4));
+            check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4));
+            check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4));
 
             // VLDM     X       F, D    Load Multiple Registers
             // VLDR     X       F, D    Load Single Register
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
index 36303bd29c16..43786143aecd 100644
--- a/test/correctness/simd_op_check.h
+++ b/test/correctness/simd_op_check.h
@@ -302,6 +302,11 @@ class SimdOpCheckTest {
     virtual void setup_images() {
         for (auto p : image_params) {
             p.reset();
+            // Set a 16-byte host alignment and make dim 0's min a multiple of the elements per 16 bytes.
+            const int alignment_bytes = 16;
+            p.set_host_alignment(alignment_bytes);
+            const int alignment = alignment_bytes / p.type().bytes();
+            p.dim(0).set_min((p.dim(0).min() / alignment) * alignment);
         }
     }
     virtual bool test_all() {
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp
index dce07531db6a..ee312da988bc 100644
--- a/test/correctness/simd_op_check_hvx.cpp
+++ b/test/correctness/simd_op_check_hvx.cpp
@@ -437,10 +437,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
         check("vnot(v*)", hvx_width / 2, ~u16_1);
         check("vnot(v*)", hvx_width / 4, ~u32_1);
 
-        // v62 - Broadcasting unsigned scalars
-        check("v*.b = vsplat(r*)", hvx_width / 1, in_u8(0));
-        check("v*.h = vsplat(r*)", hvx_width / 2, in_u16(0));
-        check("vsplat(r*)", hvx_width / 4, in_u32(0));
+        // v62 - Broadcasting scalars
+        check("vsplat(r*)", hvx_width / 1, in_u8(y));
+        check("vsplat(r*)", hvx_width / 2, in_u16(y));
+        check("vsplat(r*)", hvx_width / 4, in_u32(y));
 
         check("vmux(q*,v*,v*)", hvx_width / 1, select(i8_1 == i8_2, i8_3, i8_2));
         check("vmux(q*,v*,v*)", hvx_width / 2, select(i16_1 == i16_2, i16_3, i16_2));
diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp
index b91bbeaf7859..3bbfb672f512 100644
--- a/test/correctness/storage_folding.cpp
+++ b/test/correctness/storage_folding.cpp
@@ -1,14 +1,16 @@
 #include "Halide.h"
 #include <stdio.h>
 
+#include <set>
+
 using namespace Halide;
 
 // Override Halide's malloc and free
-
-size_t custom_malloc_size = 0;
+const int tolerance = 3 * sizeof(int);
+std::set<size_t> custom_malloc_sizes;
 
 void *my_malloc(void *user_context, size_t x) {
-    custom_malloc_size = x;
+    custom_malloc_sizes.insert(x);
     void *orig = malloc(x + 32);
     void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5);
     ((void **)ptr)[-1] = orig;
@@ -19,6 +21,28 @@ void my_free(void *user_context, void *ptr) {
     free(((void **)ptr)[-1]);
 }
 
+bool check_expected_malloc(size_t expected) {
+    for (size_t i : custom_malloc_sizes) {
+        if (std::abs((int)i - (int)expected) <= tolerance) {
+            return true;
+        }
+    }
+    printf("Expected an allocation of size %d (tolerance %d). Got instead:\n", (int)expected, tolerance);
+    for (size_t i : custom_malloc_sizes) {
+        printf("  %d\n", (int)i);
+    }
+    return false;
+}
+
+bool check_expected_mallocs(const std::vector<size_t> &expected) {
+    for (size_t i : expected) {
+        if (!check_expected_malloc(i)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 #ifdef _WIN32
 #define DLLEXPORT __declspec(dllexport)
 #else
@@ -111,9 +135,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000, 3});
 
-        size_t expected_size = 101 * 4 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 4 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
@@ -133,9 +156,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000, 3});
 
-        size_t expected_size = 101 * 1002 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 1002 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
@@ -157,15 +179,14 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = g.realize({100, 1000});
 
-        size_t expected_size = 101 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 101 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -180,7 +201,7 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        if (custom_malloc_size != 0) {
+        if (!custom_malloc_sizes.empty()) {
             printf("There should not have been a heap allocation\n");
             return -1;
         }
@@ -197,7 +218,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -213,7 +234,7 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        if (custom_malloc_size != 0) {
+        if (!custom_malloc_sizes.empty()) {
             printf("There should not have been a heap allocation\n");
             return -1;
         }
@@ -230,7 +251,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -247,10 +268,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 2 * 1002 * 4 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 2 * 1000 * 4 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -266,7 +285,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -285,10 +304,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 1000 * 8 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 1000 * 8 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -304,7 +321,7 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
         g(x, y) = x * y;
@@ -322,10 +339,8 @@ int main(int argc, char **argv) {
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 2 * 1002 * 3 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 2 * 1000 * 3 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -341,24 +356,21 @@ int main(int argc, char **argv) {
     }
 
     {
-        custom_malloc_size = 0;
+        custom_malloc_sizes.clear();
         Func f, g;
 
+        // This is tricky due to upsampling: f reads rows y / 2 and y / 2 + 1 of g.
         g(x, y) = x * y;
         f(x, y) = g(x, y / 2) + g(x, y / 2 + 1);
 
-        // The automatic storage folding optimization can't figure
-        // this out due to the downsampling. Explicitly fold it.
-        g.compute_at(f, x).store_root().fold_storage(y, 2);
+        g.compute_at(f, x).store_root();
 
         f.set_custom_allocator(my_malloc, my_free);
 
         Buffer<int> im = f.realize({1000, 1000});
 
-        // Halide allocates one extra scalar, so we account for that.
-        size_t expected_size = 1000 * 2 * sizeof(int) + sizeof(int);
-        if (custom_malloc_size == 0 || custom_malloc_size > expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        size_t expected_size = 1000 * 2 * sizeof(int);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
 
@@ -394,12 +406,11 @@ int main(int argc, char **argv) {
 
         size_t expected_size;
         if (interleave) {
-            expected_size = 101 * 3 * 3 * sizeof(int) + sizeof(int);
+            expected_size = 101 * 3 * 3 * sizeof(int);
         } else {
-            expected_size = 101 * 3 * sizeof(int) + sizeof(int);
+            expected_size = 101 * 3 * sizeof(int);
         }
-        if (custom_malloc_size == 0 || custom_malloc_size != expected_size) {
-            printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size);
+        if (!check_expected_mallocs({expected_size})) {
             return -1;
         }
     }