diff --git a/.github/workflows/testing-make.yml b/.github/workflows/testing-make.yml
index 4419bc593977..12ea624367fb 100644
--- a/.github/workflows/testing-make.yml
+++ b/.github/workflows/testing-make.yml
@@ -77,6 +77,7 @@ jobs:
               "lld-${LLVM_VERSION}" \
               "liblld-${LLVM_VERSION}-dev"
             echo "LLVM_CONFIG=llvm-config-${LLVM_VERSION}" | tee -a "$GITHUB_ENV"
+            cat /proc/cpuinfo
           elif [ "$RUNNER_OS" = "macOS" ]; then
             brew install libjpeg-turbo libpng pkgconf protobuf "llvm@${LLVM_VERSION}" "lld@${LLVM_VERSION}"
             echo "LLVM_CONFIG=$(brew --prefix "llvm@${LLVM_VERSION}")/bin/llvm-config" | tee -a "$GITHUB_ENV"
diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile
index 49104b3e5fa3..92ed5d2a5b0b 100644
--- a/apps/iir_blur/Makefile
+++ b/apps/iir_blur/Makefile
@@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
 	$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)
 
 $(BIN)/%/out.png: $(BIN)/%/filter
-	$< ../images/rgba.png $(BIN)/$*/out.png
+	$< ../images/rgb.png $(BIN)/$*/out.png
 
 clean:
 	rm -rf $(BIN)
diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp
index ef3b44eef461..7f411d7e8fef 100644
--- a/apps/iir_blur/iir_blur_generator.cpp
+++ b/apps/iir_blur/iir_blur_generator.cpp
@@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
     if (!skip_schedule) {
         if (!target.has_gpu_feature()) {
             // CPU schedule.
-            // 8.2ms on an Intel i9-9960X using 16 threads
+            // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
             // Split the transpose into tiles of rows. Parallelize over channels
-            // and strips (Halide supports nested parallelism).
-            Var xo, yo, t;
+            // and strips.
+            Var xo, yo, t, yi;
             transpose.compute_root()
                 .tile(x, y, xo, yo, x, y, vec, vec * 4)
+                .split(y, y, yi, vec)
+                .vectorize(yi)
                 .vectorize(x)
-                .parallel(yo)
-                .parallel(c);
+                .fuse(yo, c, t)
+                .parallel(t);
+
+            blur.in(transpose)
+                .compute_at(transpose, y)
+                .vectorize(x)
+                .unroll(y);
 
             // Run the filter on each row of tiles (which corresponds to a strip of
             // columns in the input).
-            blur.compute_at(transpose, yo);
+            blur.compute_at(transpose, t);
 
             // Vectorize computations within the strips.
             blur.update(0)
diff --git a/src/CSE.cpp b/src/CSE.cpp
index c2a46d93bc4d..e7e56bb4df09 100644
--- a/src/CSE.cpp
+++ b/src/CSE.cpp
@@ -237,10 +237,39 @@ class CSEEveryExprInStmt : public IRMutator {
         }
         const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle});
         internal_assert(bundle && bundle->args.size() == 2);
-        Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1],
+
+        Expr value = bundle->args[0], index = bundle->args[1];
+
+        // Figure out which of the lets are actually needed by the index
+
+        auto add_all_vars_to_set = [&](const Expr &e, std::set<std::string> &s) {
+            visit_with(e, [&](auto *, const Variable *var) {
+                s.insert(var->name);
+            });
+        };
+
+        std::set<string> index_lets;
+        add_all_vars_to_set(index, index_lets);
+        for (const auto &[var, val] : reverse_view(lets)) {
+            if (index_lets.count(var)) {
+                add_all_vars_to_set(val, index_lets);
+            }
+        }
+
+        vector<pair<string, Expr>> deferred;
+        for (const auto &[var, val] : reverse_view(lets)) {
+            if (index_lets.count(var)) {
+                deferred.emplace_back(var, val);
+            } else {
+                value = Let::make(var, val, value);
+            }
+        }
+
+        Stmt s = Store::make(op->name, value, index,
                              op->param, mutate(op->predicate), op->alignment);
-        for (const auto &[var, value] : reverse_view(lets)) {
-            s = LetStmt::make(var, value, s);
+
+        for (const auto &[var, val] : deferred) {
+            s = LetStmt::make(var, val, s);
         }
         return s;
     }
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 7178e82965d8..5cf9ccf77f26 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -1478,10 +1478,11 @@ void CodeGen_ARM::visit(const Store *op) {
         intrin_type = t;
         Type elt = t.element_of();
         int vec_bits = t.bits() * t.lanes();
-        if (elt == Float(32) || elt == Float(64) ||
-            is_float16_and_has_feature(elt) ||
-            elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
-            elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) {
+        if (t.bits() <= target.bits &&
+            (elt == Float(32) || elt == Float(64) ||
+             is_float16_and_has_feature(elt) ||
+             elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
+             elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) {
             const int target_vector_bits = native_vector_bits();
             if (vec_bits % 128 == 0) {
                 type_ok_for_vst = true;
@@ -1895,6 +1896,7 @@ void CodeGen_ARM::visit(const Shuffle *op) {
     if (target.os != Target::IOS && target.os != Target::OSX &&
         load &&
         op->vectors.size() == 1 &&
+        op->is_slice() &&
         2 <= stride && stride <= 4 &&
         op->slice_begin() < stride &&
         load->type.lanes() == stride * op->type.lanes()) {
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
index 065dcebd1a64..0681dd42605b 100644
--- a/src/CodeGen_Hexagon.cpp
+++ b/src/CodeGen_Hexagon.cpp
@@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
     llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
     llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                  const std::vector<int> &indices) override;
+    llvm::Value *optimization_fence(llvm::Value *v) override;
     using CodeGen_Posix::shuffle_vectors;
     ///@}
 
@@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
     return vdelta(concat_vectors({a, b}), indices);
 }
 
+Value *CodeGen_Hexagon::optimization_fence(Value *v) {
+    // Deliberately a no-op (v is returned as-is): as of llvm 21, the base class version
+    // seems to trip up LLVM's hexagon backend, possibly because it relies on a floating point type.
+    return v;
+}
+
 Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
                                 int max_index) {
     llvm::Type *lut_ty = lut->getType();
@@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
     return slice_vector(concat_vectors(result), 0, idx_elements);
 }
 
-bool is_power_of_two(int x) {
-    return (x & (x - 1)) == 0;
-}
-
 // vdelta and vrdelta are instructions that take an input vector and
 // pass it through a network made up of levels. Each element x at each
 // level i can either take the element from the previous level at the
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 300dfa096a1e..2d74f12b4c67 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1359,10 +1359,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) {
     s.accept(this);
 }
 
-bool CodeGen_LLVM::is_power_of_two(int x) const {
-    return (x & (x - 1)) == 0;
-}
-
 Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const {
     if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
         return Float(32, t.lanes());
@@ -2207,10 +2203,13 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
         internal_assert(vecs[0]->getType() == vecs[i]->getType());
     }
     int vec_elements = get_vector_num_elements(vecs[0]->getType());
+    const int num_vecs = (int)vecs.size();
 
-    if (vecs.size() == 1) {
+    int factor = gcd(vec_elements, num_vecs);
+
+    if (num_vecs == 1) {
         return vecs[0];
-    } else if (vecs.size() == 2) {
+    } else if (num_vecs == 2) {
         Value *a = vecs[0];
         Value *b = vecs[1];
         vector<int> indices(vec_elements * 2);
@@ -2218,57 +2217,251 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
             indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
         }
         return shuffle_vectors(a, b, indices);
-    } else {
-        // Grab the even and odd elements of vecs.
-        vector<Value *> even_vecs;
-        vector<Value *> odd_vecs;
-        for (size_t i = 0; i < vecs.size(); i++) {
-            if (i % 2 == 0) {
-                even_vecs.push_back(vecs[i]);
-            } else {
-                odd_vecs.push_back(vecs[i]);
+    } else if (factor == 1) {
+        // The number of vectors and the vector length are
+        // coprime (e.g. interleaving an odd number of vectors of some
+        // power-of-two length). Use the algorithm from "A Decomposition for
+        // In-place Matrix Transposition" by Catanzaro et al.
+        std::vector<Value *> v = vecs;
+
+        // Using unary shuffles, get each element into the right ultimate
+        // lane. This works out without collisions because the number of vectors
+        // and the length of each vector are coprime.
+        std::vector<int> shuffle(vec_elements);
+        for (int i = 0; i < num_vecs; i++) {
+            for (int j = 0; j < vec_elements; j++) {
+                int k = j * num_vecs + i;
+                shuffle[k % vec_elements] = j;
             }
+            v[i] = shuffle_vectors(v[i], v[i], shuffle);
+        }
+
+        // We intentionally don't put an optimization fence after the unary
+        // shuffles, because some architectures have a two-way shuffle, so it
+        // helps to fuse the unary shuffle into the first layer of two-way
+        // blends below.
+
+        // Now we need to transfer the elements across the vectors. If we
+        // reorder the vectors, this becomes a rotation across the vectors of a
+        // different amount per lane.
+        std::vector<Value *> new_v(v.size());
+        for (int i = 0; i < num_vecs; i++) {
+            int j = (i * vec_elements) % num_vecs;
+            new_v[i] = v[j];
         }
+        v.swap(new_v);
 
-        // If the number of vecs is odd, save the last one for later.
-        Value *last = nullptr;
-        if (even_vecs.size() > odd_vecs.size()) {
-            last = even_vecs.back();
-            even_vecs.pop_back();
+        std::vector<int> rotation(vec_elements, 0);
+        for (int i = 0; i < vec_elements; i++) {
+            int k = (i * num_vecs) % vec_elements;
+            rotation[k] = (i * num_vecs) / vec_elements;
         }
-        internal_assert(even_vecs.size() == odd_vecs.size());
+        internal_assert(rotation[0] == 0);
 
-        // Interleave the even and odd parts.
-        Value *even = interleave_vectors(even_vecs);
-        Value *odd = interleave_vectors(odd_vecs);
+        // We'll handle each bit of the rotation one at a time with a two-way
+        // shuffle.
+        int d = 1;
+        while (d < num_vecs) {
 
-        if (last) {
-            int result_elements = vec_elements * vecs.size();
+            for (int i = 0; i < vec_elements; i++) {
+                shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements);
+            }
+
+            for (int i = 0; i < num_vecs; i++) {
+                int j = (i + num_vecs - d) % num_vecs;
+                new_v[i] = shuffle_vectors(v[i], v[j], shuffle);
+            }
+
+            v.swap(new_v);
 
-            // Interleave even and odd, leaving a space for the last element.
-            vector<int> indices(result_elements, -1);
-            for (int i = 0, idx = 0; i < result_elements; i++) {
-                if (i % vecs.size() < vecs.size() - 1) {
-                    indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size();
-                    idx++;
+            d *= 2;
+        }
+
+        return concat_vectors(v);
+    } else {
+        // The number of vectors shares a factor with the length of the
+        // vectors. Pick some factor of the number of vectors, interleave in
+        // separate groups, and then interleave the results. Do the largest
+        // power of two factor first.
+        int f = largest_power_of_two_factor(num_vecs);
+        if (f == 1 || f == num_vecs) {
+            for (int i = 2; i < num_vecs; i++) {
+                if (num_vecs % i == 0) {
+                    f = i;
+                    break;
                 }
             }
-            Value *even_odd = shuffle_vectors(even, odd, indices);
+        }
 
-            // Interleave the last vector into the result.
-            last = slice_vector(last, 0, result_elements);
-            for (int i = 0; i < result_elements; i++) {
-                if (i % vecs.size() < vecs.size() - 1) {
-                    indices[i] = i;
-                } else {
-                    indices[i] = i / vecs.size() + result_elements;
+        // if f == 1 then the vector length is a multiple of the
+        // interleaving factor and the number of vectors is prime but not two
+        // (e.g. vec_elements = 24 and num_vecs = 3). Pad each vector out to a
+        // power of two size, interleave, and discard the tail of the
+        // result. This buys us some extra room to run Catanzaro's algorithm in.
+        if (f == 1) {
+            int padded_size = next_power_of_two(vec_elements);
+            std::vector<Value *> padded(num_vecs);
+            for (int i = 0; i < num_vecs; i++) {
+                // slice_vector can also be used to pad with don't cares
+                padded[i] = slice_vector(vecs[i], 0, padded_size);
+            }
+            Value *v = interleave_vectors(padded);
+            return slice_vector(v, 0, num_vecs * vec_elements);
+        }
+
+        internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0)
+            << f << " " << num_vecs << " " << factor;
+
+        vector<vector<Value *>> groups(f);
+        for (int i = 0; i < num_vecs; i++) {
+            groups[i % f].push_back(vecs[i]);
+        }
+
+        // Interleave each group
+        vector<Value *> interleaved(f);
+        for (int i = 0; i < f; i++) {
+            interleaved[i] = optimization_fence(interleave_vectors(groups[i]));
+        }
+
+        // Interleave the result
+        return interleave_vectors(interleaved);
+    }
+}
+
+std::vector<Value *> CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) {
+    int vec_elements = get_vector_num_elements(vec->getType());
+    internal_assert(vec_elements % num_vecs == 0);
+    vec_elements /= num_vecs;
+
+    int factor = gcd(vec_elements, num_vecs);
+
+    if (num_vecs == 1) {
+        return {vec};
+    } else if (num_vecs == 2) {
+        std::vector<Value *> result(2);
+        std::vector<int> indices(vec_elements);
+        for (int i = 0; i < vec_elements; i++) {
+            indices[i] = i * 2;
+        }
+        result[0] = shuffle_vectors(vec, vec, indices);
+        for (int i = 0; i < vec_elements; i++) {
+            indices[i]++;
+        }
+        result[1] = shuffle_vectors(vec, vec, indices);
+        return result;
+    } else if (factor == 1) {
+        // Use the inverse of Catanzaro's algorithm from above. We slice into
+        // distinct vectors, then rotate each element into the correct final
+        // vector, then do a unary permutation of each vector.
+
+        // Instead of concatenating, we slice.
+        std::vector<Value *> v(num_vecs);
+        for (int i = 0; i < num_vecs; i++) {
+            v[i] = slice_vector(vec, i * vec_elements, vec_elements);
+        }
+
+        // Compute the same rotation as above
+        std::vector<int> rotation(vec_elements, 0);
+        for (int i = 0; i < vec_elements; i++) {
+            int k = (i * num_vecs) % vec_elements;
+            rotation[k] = (i * num_vecs) / vec_elements;
+        }
+        internal_assert(rotation[0] == 0);
+
+        // We'll handle each bit of the rotation one at a time with a two-way
+        // shuffle.
+        std::vector<int> shuffle(vec_elements);
+        std::vector<Value *> new_v(v.size());
+        int d = 1;
+        while (d < num_vecs) {
+
+            for (int i = 0; i < vec_elements; i++) {
+                shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements);
+            }
+
+            for (int i = 0; i < num_vecs; i++) {
+                // The rotation is in the opposite direction to the interleaving
+                // version, so num_vecs - d becomes just d.
+                int j = (i + d) % num_vecs;
+                // An optimization fence here keeps it as a blend and stops it
+                // from getting fused with the unary shuffle below.
+                new_v[i] = optimization_fence(shuffle_vectors(v[i], v[j], shuffle));
+            }
+
+            v.swap(new_v);
+            d *= 2;
+        }
+
+        // Now reorder the vectors in the inverse order to the above.
+        for (int i = 0; i < num_vecs; i++) {
+            int j = (i * vec_elements) % num_vecs;
+            // j and i are swapped below, because we're doing the inverse of the
+            // algorithm above. This map is 1:1 because vec_elements and
+            // num_vecs are coprime, so every slot of new_v is stored to.
+            new_v[j] = v[i];
+        }
+        v.swap(new_v);
+
+        // The elements are now in the correct vector. Finish up with a unary
+        // shuffle of each.
+        for (int i = 0; i < num_vecs; i++) {
+            for (int j = 0; j < vec_elements; j++) {
+                int k = j * num_vecs + i;
+                // This is the inverse shuffle of the interleaving version, so
+                // the index and the arg of the assignment below are swapped
+                // compared to the above.
+                shuffle[j] = k % vec_elements;
+            }
+
+            v[i] = shuffle_vectors(v[i], v[i], shuffle);
+        }
+
+        return v;
+
+    } else {
+        // Do a lower-factor deinterleave, then deinterleave each result
+        // again. Do the largest power-of-two factor first. If num_vecs is an
+        // odd prime there is no non-trivial factor to split on and f stays 1,
+        // which is handled by padding below.
+        int f = largest_power_of_two_factor(num_vecs);
+        if (f == 1 || f == num_vecs) {
+            for (int i = 2; i < num_vecs; i++) {
+                if (num_vecs % i == 0) {
+                    f = i;
+                    break;
                 }
             }
+        }
 
-            return shuffle_vectors(even_odd, last, indices);
-        } else {
-            return interleave_vectors({even, odd});
+        // if f == 1 then the final vector length is a multiple of the
+        // deinterleaving factor and the number of vectors is prime but not two
+        // (e.g. vec_elements = 24 and num_vecs = 3). Pad the vector out to a
+        // power of two size, deinterleave, and discard the tail of each vector
+        // result. This buys us some extra room to run Catanzaro's algorithm in.
+        if (f == 1) {
+            int padded_size = next_power_of_two(vec_elements);
+            Value *padded = slice_vector(vec, 0, padded_size * num_vecs);
+            std::vector<Value *> result = deinterleave_vector(padded, num_vecs);
+            for (int i = 0; i < num_vecs; i++) {
+                result[i] = slice_vector(result[i], 0, vec_elements);
+            }
+            return result;
         }
+
+        internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0)
+            << f << " " << num_vecs << " " << factor;
+
+        auto partial = deinterleave_vector(vec, f);
+        std::vector<Value *> result(num_vecs);
+        for (size_t i = 0; i < partial.size(); i++) {
+            Value *v = partial[i];
+            auto vecs = deinterleave_vector(v, num_vecs / f);
+            for (size_t j = 0; j < vecs.size(); j++) {
+                result[j * f + i] = vecs[j];
+            }
+        }
+
+        return result;
     }
 }
 
@@ -4162,6 +4355,24 @@ void CodeGen_LLVM::visit(const Shuffle *op) {
 
     if (op->is_interleave()) {
         value = interleave_vectors(vecs);
+    } else if (op->is_transpose()) {
+        int cols = op->transpose_factor();
+        int rows = op->vectors[0].type().lanes() / cols;
+        if (is_power_of_two(cols) &&
+            !is_power_of_two(rows)) {
+            // We're doing something like vectorizing over c and x when storing
+            // packed rgb. Best handled as an interleave.
+            std::vector<Value *> slices(rows);
+            for (int i = 0; i < rows; i++) {
+                slices[i] = slice_vector(vecs[0], i * cols, cols);
+            }
+            value = interleave_vectors(slices);
+        } else {
+            // Deinterleave out the cols of the input matrix and concat
+            // them. Occurs when, for example, loading packed RGB and
+            // vectorizing across x.
+            value = concat_vectors(deinterleave_vector(vecs[0], cols));
+        }
     } else if (op->is_concat()) {
         value = concat_vectors(vecs);
     } else {
@@ -4981,8 +5192,12 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
 
 Value *CodeGen_LLVM::optimization_fence(Value *v) {
     llvm::Type *t = v->getType();
-    internal_assert(!t->isScalableTy())
-        << "optimization_fence does not support scalable vectors yet";
+    if (t->isScalableTy()) {
+        // Convert to fixed, fence, convert back.
+        Value *fixed = scalable_to_fixed_vector_type(v);
+        fixed = optimization_fence(fixed);
+        return fixed_to_scalable_vector_type(fixed);
+    }
     const int bits = t->getPrimitiveSizeInBits();
     if (bits % 32) {
         const int lanes = get_vector_num_elements(t);
@@ -4994,7 +5209,7 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) {
         v = slice_vector(v, 0, lanes);
         return v;
     }
-    llvm::Type *float_type = llvm_type_of(Float(32, bits / 32));
+    llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed);
     v = builder->CreateBitCast(v, float_type);
     v = builder->CreateArithmeticFence(v, float_type);
     return builder->CreateBitCast(v, t);
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index bdd267020f1a..57d78172c4fa 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -462,6 +462,9 @@ class CodeGen_LLVM : public IRVisitor {
      * an arbitrary number of vectors.*/
     virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);
 
+    /** The inverse of interleave_vectors. */
+    virtual std::vector<llvm::Value *> deinterleave_vector(llvm::Value *vec, int num_vecs);
+
     /** Description of an intrinsic function overload. Overloads are resolved
      * using both argument and return types. The scalar types of the arguments
      * and return type must match exactly for an overload resolution to succeed. */
@@ -530,8 +533,6 @@ class CodeGen_LLVM : public IRVisitor {
     /** Shorthand for shuffling a single vector. */
     llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);
 
-    bool is_power_of_two(int x) const;
-
     bool is_scalable_vector(llvm::Value *v) const;
 
     /** Go looking for a vector version of a runtime function. Will
diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
index f5cd7713884c..3e52ac7fa1a5 100644
--- a/src/CodeGen_X86.cpp
+++ b/src/CodeGen_X86.cpp
@@ -11,6 +11,8 @@
 #include "Substitute.h"
 #include "Util.h"
 
+#include <algorithm>
+
 namespace Halide {
 namespace Internal {
 
@@ -111,6 +113,9 @@ class CodeGen_X86 : public CodeGen_Posix {
     void codegen_vector_reduce(const VectorReduce *, const Expr &init) override;
     // @}
 
+    std::vector<llvm::Value *> deinterleave_vector(llvm::Value *, int) override;
+    llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &) override;
+
 private:
     Scope<MemoryType> mem_type;
 };
@@ -929,6 +934,753 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init
     CodeGen_Posix::codegen_vector_reduce(op, init);
 }
 
+std::vector<Value *> CodeGen_X86::deinterleave_vector(Value *vec, int num_vecs) {
+    int vec_elements = get_vector_num_elements(vec->getType()) / num_vecs;
+    const size_t element_bits = vec->getType()->getScalarSizeInBits();
+    if (target.has_feature(Target::AVX) &&
+        is_power_of_two(num_vecs) &&
+        is_power_of_two(vec_elements) &&
+        (int)(vec_elements * num_vecs * element_bits) > native_vector_bits()) {
+        // Deinterleaving is the same as interleaving the num_vecs-wide slices of
+        // vec and then re-slicing. Our interleaving logic below supports this case
+        std::vector<Value *> slices(vec_elements);
+        for (int i = 0; i < vec_elements; i++) {
+            slices[i] = slice_vector(vec, i * num_vecs, num_vecs);
+        }
+        vec = interleave_vectors(slices);
+        std::vector<Value *> result(num_vecs);
+        for (int i = 0; i < num_vecs; i++) {
+            result[i] = slice_vector(vec, i * vec_elements, vec_elements);
+        }
+        return result;
+    } else {
+        return CodeGen_Posix::deinterleave_vector(vec, num_vecs);
+    }
+}
+
+Value *CodeGen_X86::interleave_vectors(const std::vector<Value *> &vecs) {
+    // Only use x86-specific interleaving for AVX and above
+    if (vecs.empty() || !target.has_feature(Target::AVX)) {
+        return CodeGen_Posix::interleave_vectors(vecs);
+    }
+
+    if (vecs.size() == 1) {
+        return vecs[0];
+    }
+
+    // Get the element type and vector properties
+    llvm::Type *vec_type = vecs[0]->getType();
+    llvm::Type *element_type = get_vector_element_type(vec_type);
+    int vec_elements = get_vector_num_elements(vec_type);
+    const size_t element_bits = element_type->getScalarSizeInBits();
+    const size_t elems_per_native_vec = native_vector_bits() / element_bits;
+    const size_t elems_per_slice = 128 / element_bits;
+
+    // Only apply special x86 logic for power-of-two interleaves for avx and
+    // above where we're going to end up with multiple native vectors.
+
+    if (!is_power_of_two(vec_elements) &&
+        vec_elements % elems_per_native_vec == 0) {
+        // It's not a power of two, but it's a multiple of the native vector
+        // length, so slice it and recurse.
+        std::vector<Value *> results;
+        for (int i = 0; i < vec_elements; i += elems_per_native_vec) {
+            std::vector<Value *> slices;
+            slices.reserve(vecs.size());
+            for (auto *v : vecs) {
+                slices.push_back(slice_vector(v, i, (int)elems_per_native_vec));
+            }
+            results.push_back(interleave_vectors(slices));
+        }
+        return concat_vectors(results);
+    }
+
+    if (!is_power_of_two(vec_elements) ||
+        !is_power_of_two(vecs.size()) ||
+        (vecs.size() * vec_elements * element_bits) <= (size_t)native_vector_bits()) {
+        return CodeGen_Posix::interleave_vectors(vecs);
+    }
+
+    /*
+      x86 has a weird set of vector shuffle instructions due to historical
+      baggage, and the strategy in the base class for interleaving vectors
+      works poorly. Here we have a somewhat complex algorithm for generating
+      better sequences of shuffle instructions for avx and avx-512.
+
+      Consider the location of one of the elements of one of the vectors. It has
+      a vector index, which says which vector it's in, and a vector lane index,
+      which gives the lane. x86 shuffles work in terms of 128-bit subvectors,
+      which we will call slices. So we'll decompose that lane index into a slice
+      index, to identify the 128-bit slice within a vector, and the lane index
+      within that slice. For avx the slice index is either zero or one, and for
+      avx-512 it can be zero through three. Because we have limited everything
+      to be a power of two, we can write out these indices in binary. We'll use
+      v for the vector index, s for the slice index, and l for the lane
+      index. For an avx-512 interleave of 16 vectors of 32 elements each
+      (i.e. uint16s), a location could thus be written as:
+
+      [l0 l1 l2] [s0 s1] [v0 v1 v2 v3]
+
+      where l0 is the least-significant bit of the lane index, and so on.
+
+      An interleave takes the bits that give the vector index and moves them to
+      be the least significant bits, shifting everything else over. So the
+      indices of our vectors after the interleave should be:
+
+      [v0 v1 v2] [v3 l0] [l1 l2 s0 s1]
+
+      Assigning numbers to each according to their final location, we start with:
+
+      [4 5 6] [7 8] [0 1 2 3]
+
+      and we want to issue some sequence of instructions to get us to:
+
+      [0 1 2] [3 4] [5 6 7 8]
+
+      Now let's consider the instructions we have available. These generally
+      permute these bits. E.g. an instruction that interleaves two entire
+      vectors, applied to pairs of vectors, would take the same vector bit and
+      make it the lowest lane bit instead, shuffling the other bits upwards,
+      with the highest-order within-vector bit taking the place of the vector
+      bit (because we produce separate vectors for the low and high half of the
+      result). So if we used this instruction to push the highest vector bit
+      inwards, we could turn this:
+
+      [4 5 6] [7 8] [0 1 2 3]
+
+      into this:
+
+      [3 4 5] [6 7] [0 1 2 8]
+
+      If we did this three more times, pulling a different vector bit in each
+      time, we'd get:
+
+      [0 1 2] [3 4] [5 6 7 8]
+
+      and we'd be done! This is what the base class does. Unfortunately, x86 has
+      no such instruction, so we'll have to figure out something else.
+      Interleaving vectors often happens in contexts with high register
+      pressure, so we will restrict our attention to instructions that take
+      immediates. The most important one is vunpckl/h. This interleaves lanes
+      between two vectors but staying within each 128-bit slice. So the slice
+      bits will be unchanged, and the lane bits will be rotated right along with
+      one of the vector bits. So if we interleave vectors starting from the
+      second-highest vector bit, we can turn this:
+
+      [4 5 6] [_ _] [_ _ 2 _]
+
+      into this:
+
+      [2 4 5] [_ _] [_ _ 6 _]
+
+      where the underscores indicate bits that are unchanged.
+
+      Unlike a full vector interleave, the slice bits stayed fixed, and the
+      highest within-slice lane bit (6) took the place of the vector bit
+      instead. This is at least a good start. If we do this two more times,
+      pulling in vector bits 0 and 1, we can make this:
+
+      [0 1 2] [7 8] [4 5 6 3]
+
+      The lane bits are now in the desired state. The next instruction to
+      consider is shufi. It's more general than this, but for our purposes there
+      are two interesting things we can do with it. We concatenate the low halves
+      of two vectors or the high halves of two vectors, which swaps the
+      high-order slice bit with one of the vector bits:
+
+      [_ _ _] [_ 8] [_ _ _ 3] -> [_ _ _] [_ 3] [_ _ _ 8]
+
+      We can also interleave the even slices of a vector with the even slices of
+      another (and do the same for odd), which rotates left the two slice bits
+      together with one of the vector bits:
+
+      [_ _ _] [7 3] [4 _ _ _] -> [_ _ _] [3 4] [7 _ _ _]
+
+      The vector bit became the high slice bit, the low slice bit took the place
+      of the vector bit, and the high slice bit becomes the low slice
+      bit. Filling in the underscores, we're now in this state:
+
+      [0 1 2] [3 4] [7 5 6 8]
+
+      Only the vector bits are wrong, but permuting entire vectors is free,
+      because that's just changing which register names we're referring to
+      (shuffling our array of llvm::Value *). So all totalled, per vector, we
+      needed three unpckl/h instructions, and one shufi instruction of each
+      kind. If the vectors were a narrower type, it would have just added one
+      more unpckl.
+
+      If you're interleaving lots of complete vectors, that's the whole story,
+      but there are other situations to consider. It's not uncommon to want to
+      interleave half-vectors to make some number of full vectors. We can model
+      this by having some slice or even lane bits start as missing. So
+      interleaving 16 half-vectors of uint16s to 8 full vectors would be
+      starting from this:
+
+      [4 5 6] [7] [0 1 2 3]
+
+      and trying to get here:
+
+      [0 1 2] [3 4] [5 6 7]
+
+      Each of our instructions has to operate on every vector, so to reduce the
+      number of instructions we'd first like to do something to create that
+      missing high slice bit, halving the number of vectors. E.g. we could
+      identify pairs of vectors to concatenate. Let's try concatenating pairs
+      using the high vector bit (3):
+
+      [4 5 6] [7 3] [0 1 2]
+
+      Now we do three unpcks to rotate 0 1 2 into the correct place:
+
+      [0 1 2] [7 3] [4 5 6]
+
+      Now a single shufi can rotate 7 3 and 4:
+
+      [0 1 2] [3 4] [7 5 6]
+
+      and we just need to reorder whole vectors and we're done. So in this case
+      we needed only a single shufi instruction, because our desired low slice
+      bit (3) was already sitting there as the high slice bit after
+      pairwise concatenation.
+
+      Now consider the case where we had only four half-vectors to interleave to
+      produce two whole vectors:
+
+      [2 3 4] [5] [0 1]
+
+      Let's concatenate adjacent pairs as before.
+
+      [2 3 4] [5 0] [1]
+
+      Now we do one unpck
+
+      [1 2 3] [5 0] [4]
+
+      And we encounter a problem when it comes to the second one. The next bit
+      we want to pull in is hiding in the slice bits, which unpck instructions
+      can't access. So at this point we use a shufi to push it back into the
+      vector bits, swapping 0 and 4.
+
+      [1 2 3] [5 4] [0]
+
+      Now we can do the last unpck.
+
+      [0 1 2] [5 4] [3]
+
+      From here we can use two shufi instructions to fix up the vector and slice
+      bits.
+
+      So there are many possible paths depending on the number of elements per
+      vector, the number of elements per 128-bit slice of each vector, and the
+      number of vectors to interleave. The way to stay sane is to just
+      explicitly track the vectors above as l_bits, s_bits, and v_bits, and
+      transform it alongside all our instructions as we try to get the right
+      bits in the right final places.
+    */
+
+    // Make a working copy
+    std::vector<llvm::Value *> v = vecs;
+
+    // The number of 128-bit slices per vector is 2 for avx and 4 for avx512
+    const int final_num_s_bits = ctz64(native_vector_bits() / 128);
+    internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2)
+        << native_vector_bits() << " " << final_num_s_bits;
+
+    const int num_v_bits = ctz64(v.size());
+    const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 0 : ctz64(vec_elements / elems_per_slice);
+    const int num_l_bits = ctz64(std::min((size_t)vec_elements, elems_per_slice));
+
+    // Construct the initial tracking vectors for each bit location
+    std::vector<int> v_bits(num_v_bits), l_bits(num_l_bits), s_bits(num_s_bits);
+    int c = 0;
+    for (int i = 0; i < num_v_bits; i++) {
+        // We want the v bits to end up innermost, so number them 0, 1, 2 ...
+        v_bits[i] = c++;
+    }
+    for (int i = 0; i < num_l_bits; i++) {
+        // Then come the l bits
+        l_bits[i] = c++;
+    }
+    for (int i = 0; i < num_s_bits; i++) {
+        // and finally, the slice bits
+        s_bits[i] = c++;
+    }
+
+    // Now we define helpers for each instruction we are going to use
+
+    // Useful for debugging or enhancing this algorithm
+    /*
+    auto dump_bits = [&]() {
+        for (int b : l_bits) {
+            debug(0) << b << " ";
+        }
+        debug(0) << "| ";
+        for (int b : s_bits) {
+            debug(0) << b << " ";
+        }
+        debug(0) << "| ";
+        for (int b : v_bits) {
+            debug(0) << b << " ";
+        }
+        debug(0) << "\n";
+    };
+    */
+
+    // unpckl/h instruction
+    auto unpck = [&](Value *a, Value *b) -> std::pair<Value *, Value *> {
+        int n = get_vector_num_elements(a->getType());
+        std::vector<int> lo_indices, hi_indices;
+
+        for (int i = 0; i < n; i += (int)elems_per_slice) {
+            int half = (int)elems_per_slice / 2;
+            // For the low result, interleave the first half of each slice
+            for (int j = 0; j < half; j++) {
+                lo_indices.push_back(i + j);
+                lo_indices.push_back(n + i + j);
+            }
+            // For the high result, interleave the second half of each slice
+            for (int j = half; j < (int)elems_per_slice; j++) {
+                hi_indices.push_back(i + j);
+                hi_indices.push_back(n + i + j);
+            }
+        }
+
+        Value *lo = shuffle_vectors(a, b, lo_indices);
+        Value *hi = shuffle_vectors(a, b, hi_indices);
+        // Everything falls apart if we let LLVM fuse shuffles, so we add
+        // optimization fences around the results to ensure we get the
+        // instructions we're asking for.
+        return {optimization_fence(lo), optimization_fence(hi)};
+    };
+
+    // shufi instruction, with or without cross-over
+    auto shufi = [&](Value *a, Value *b, bool crossover) -> std::pair<Value *, Value *> {
+        int n = get_vector_num_elements(a->getType());
+        std::vector<int> lo_indices, hi_indices;
+        if (final_num_s_bits == 2) {
+            // AVX-512
+            for (int i = 0; i < (int)elems_per_slice; i++) {
+                lo_indices.push_back(i);
+                hi_indices.push_back(i + (crossover ? 1 : 2) * (int)elems_per_slice);
+            }
+            for (int i = 0; i < (int)elems_per_slice; i++) {
+                lo_indices.push_back(i + (crossover ? 2 : 1) * (int)elems_per_slice);
+                hi_indices.push_back(i + 3 * (int)elems_per_slice);
+            }
+            for (int i = 0; i < (int)elems_per_slice * 2; i++) {
+                lo_indices.push_back(lo_indices[i] + n);
+                hi_indices.push_back(hi_indices[i] + n);
+            }
+        } else {
+            // AVX-2
+            for (int i = 0; i < (int)elems_per_slice; i++) {
+                lo_indices.push_back(i);
+                hi_indices.push_back(i + elems_per_slice);
+            }
+            for (int i = 0; i < (int)elems_per_slice; i++) {
+                lo_indices.push_back(lo_indices[i] + n);
+                hi_indices.push_back(hi_indices[i] + n);
+            }
+        }
+        Value *lo = shuffle_vectors(a, b, lo_indices);
+        Value *hi = shuffle_vectors(a, b, hi_indices);
+        return {lo, hi};
+    };
+
+    // A 2x2 transpose of slices within a single vector
+    auto self_shufi = [&](Value *a) -> Value * {
+        internal_assert(4 * (int)elems_per_slice == vec_elements)
+            << "Should only be using shufi helper when targeting avx-512 shuffles on native vectors\n"
+            << elems_per_slice << " " << vec_elements << " " << native_vector_bits() << "\n";
+        std::vector<int> indices;
+        for (int j : {0, 2, 1, 3}) {
+            for (int i = 0; i < (int)elems_per_slice; i++) {
+                indices.push_back(i + j * (int)elems_per_slice);
+            }
+        }
+        return shuffle_vectors(a, a, indices);
+    };
+
+    // A helper to iterate over all pairs of entries in v, separated by some
+    // power-of-two spacing.
+    auto for_all_pairs = [&](size_t log_step, auto fn) {
+        size_t step = 1 << log_step;
+        for (size_t i = 0; i < v.size(); i++) {
+            // Pair each vector with the one separated by the step.
+            size_t j = i ^ step;
+
+            // Don't process vectors twice.
+            if (j < i) {
+                continue;
+            }
+
+            fn(&v[i], &v[j]);
+        }
+    };
+
+    // First, if the vectors are wider than native, that will manifest as too
+    // many slice bits. Cut them into separate native vectors. This will not
+    // create any instructions.
+    while ((size_t)vec_elements > elems_per_native_vec) {
+        int cut = vec_elements / 2;
+        std::vector<Value *> new_v;
+        new_v.reserve(v.size() * 2);
+        for (auto *vec : v) {
+            new_v.push_back(slice_vector(vec, 0, cut));
+        }
+        for (auto *vec : v) {
+            new_v.push_back(slice_vector(vec, cut, cut));
+        }
+        v = new_v;
+        vec_elements = cut;
+
+        v_bits.push_back(s_bits.back());
+        s_bits.pop_back();
+    }
+
+    // If adjacent vectors are shuffles of the same underlying vector(s),
+    // concatenate pairs, because this is probably free.
+    while ((size_t)vec_elements < elems_per_native_vec && !v_bits.empty()) {
+        std::vector<Value *> new_v;
+        new_v.reserve(v.size() / 2);
+        bool fail = false;
+        std::vector<int> indices;
+        indices.reserve(vec_elements * 2);
+        for (size_t i = 0; i < v.size(); i += 2) {
+            ShuffleVectorInst *a = llvm::dyn_cast<ShuffleVectorInst>(v[i]);
+            ShuffleVectorInst *b = llvm::dyn_cast<ShuffleVectorInst>(v[i + 1]);
+            if (a &&
+                b &&
+                a->getOperand(0) == b->getOperand(0) &&
+                a->getOperand(1) == b->getOperand(1)) {
+
+                // Concatenate the two shuffles
+                indices.clear();
+                for (int j : a->getShuffleMask()) {
+                    indices.push_back(j);
+                }
+                for (int j : b->getShuffleMask()) {
+                    indices.push_back(j);
+                }
+                new_v.push_back(shuffle_vectors(a->getOperand(0), a->getOperand(1), indices));
+            } else {
+                fail = true;
+            }
+        }
+        if (fail) {
+            break;
+        }
+
+        v.swap(new_v);
+        // The lowest vector bit becomes the highest lane or slice bit
+        if ((size_t)vec_elements < elems_per_slice) {
+            l_bits.push_back(v_bits[0]);
+        } else {
+            s_bits.push_back(v_bits[0]);
+        }
+        v_bits.erase(v_bits.begin());
+        vec_elements *= 2;
+    }
+
+    if (final_num_s_bits > 1 &&
+        (size_t)vec_elements == elems_per_native_vec &&
+        (size_t)v_bits[0] >= l_bits.size() - 1) {
+        // A big binary shuffle of adjacent pairs will fix the l bits
+        // entirely. AVX-512 has these. Yes, this will use registers for the
+        // shuffle indices, but the alternative requires very many unpck
+        // operations to completely cycle out the v_bits that are hiding in the
+        // bottom of the l_bits.
+
+        std::vector<int> lo_indices(vec_elements);
+        std::vector<int> hi_indices(vec_elements);
+        std::vector<int> sorted_bits = l_bits;
+        sorted_bits.insert(sorted_bits.end(), s_bits.begin(), s_bits.end());
+        sorted_bits.push_back(v_bits[0]);
+        std::sort(sorted_bits.begin(), sorted_bits.end());
+        std::vector<int> idx_of_bit(l_bits.size() + s_bits.size() + v_bits.size(), 0);
+        for (size_t b = 0; b < sorted_bits.size(); b++) {
+            idx_of_bit[sorted_bits[b]] = b;
+        }
+
+        for (size_t dst_idx = 0; dst_idx < (size_t)vec_elements * 2; dst_idx++) {
+            size_t src_idx = 0;
+            for (size_t b = 0; b < l_bits.size(); b++) {
+                src_idx |= ((dst_idx >> idx_of_bit[l_bits[b]]) & 1) << b;
+            }
+            for (size_t b = 0; b < s_bits.size(); b++) {
+                src_idx |= ((dst_idx >> idx_of_bit[s_bits[b]]) & 1) << (b + l_bits.size());
+            }
+            src_idx |= ((dst_idx >> idx_of_bit[v_bits[0]]) & 1) << (l_bits.size() + s_bits.size());
+            if (dst_idx < (size_t)vec_elements) {
+                lo_indices[dst_idx] = (int)src_idx;
+            } else {
+                hi_indices[dst_idx - vec_elements] = (int)src_idx;
+            }
+        }
+
+        for_all_pairs(0, [&](auto *a, auto *b) {
+            Value *lo = shuffle_vectors(*a, *b, lo_indices);
+            Value *hi = shuffle_vectors(*a, *b, hi_indices);
+            *a = lo;
+            *b = hi;
+        });
+
+        auto first_s_bit = sorted_bits.begin() + l_bits.size();
+        std::copy(sorted_bits.begin(), first_s_bit, l_bits.begin());
+        std::copy(first_s_bit, first_s_bit + s_bits.size(), s_bits.begin());
+        v_bits[0] = sorted_bits.back();
+    }
+
+    // Interleave pairs if we have vectors smaller than a single slice. Choosing
+    // which pairs to interleave is important because we want to pull down v
+    // bits that are destined to end up as l bits, and we want to pull them down
+    // in order.
+    if ((size_t)vec_elements < elems_per_slice) {
+        int highest_desired_l_bit = ctz64(elems_per_slice) - 1;
+        int bit = highest_desired_l_bit;
+        if (!v_bits.empty() && std::find(v_bits.begin(), v_bits.end(), bit) == v_bits.end()) {
+            bit = v_bits.back();
+        }
+
+        while (bit >= 0 && (size_t)vec_elements < elems_per_slice && !v_bits.empty()) {
+            auto it = std::find(v_bits.begin(), v_bits.end(), bit);
+            if (it == v_bits.end()) {
+                break;
+            }
+            int j = it - v_bits.begin();
+            v_bits.erase(it);
+            l_bits.insert(l_bits.begin(), bit);
+
+            // The distance in the vecs array is the index of the corresponding
+            // v bit we're pulling down.
+            std::vector<Value *> new_v;
+            new_v.reserve(v.size() / 2);
+            for_all_pairs(j, [&](auto *a, auto *b) {
+                // Just interleave the two vectors. Because we have fewer
+                // elements than one slice, unpckl/h is a straight interleave.
+                std::vector<int> indices;
+                for (int k = 0; k < vec_elements; k++) {
+                    indices.push_back(k);
+                    indices.push_back(vec_elements + k);
+                }
+                new_v.push_back(shuffle_vectors(*a, *b, indices));
+            });
+            v.swap(new_v);
+            vec_elements *= 2;
+            bit--;
+        }
+    }
+
+    // Concatenate/repack to get at least the desired number of slice bits.
+    while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) {
+        const int desired_low_slice_bit = ctz64(elems_per_slice);
+        const int desired_high_slice_bit = desired_low_slice_bit + 1;
+        int bit;
+        if (!s_bits.empty() &&
+            s_bits[0] == desired_low_slice_bit) {
+            // Only the avx-512 path should land here due to the while condition.
+            internal_assert(final_num_s_bits == 2);
+            bit = desired_high_slice_bit;
+        } else {
+            bit = desired_low_slice_bit;
+        }
+
+        auto v_it = std::find(v_bits.begin(), v_bits.end(), bit);
+
+        if (v_it == v_bits.end()) {
+            // Just concatenate according to the lowest vector bit.
+            v_it = v_bits.begin();
+            bit = *v_it;
+        }
+
+        int j = v_it - v_bits.begin();
+        v_bits.erase(v_it);
+        s_bits.push_back(bit);
+
+        std::vector<Value *> new_v;
+        new_v.reserve(v.size() / 2);
+        for_all_pairs(j, [&](auto *a, auto *b) {
+            new_v.push_back(concat_vectors({*a, *b}));
+        });
+        v.swap(new_v);
+        vec_elements *= 2;
+    }
+
+    // There should be more than one vector left
+    internal_assert(v.size() > 1);
+
+    // Now we have at least two whole vectors. Next we try to finalize lane bits using
+    // unpck instructions.
+    while (l_bits[0] != 0) {
+
+        int first_s_bit = (int)ctz64(elems_per_slice);
+        int bit = std::min(l_bits[0], first_s_bit) - 1;
+
+        auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit);
+
+        // internal_assert(vb_it != v_bits.end());
+        if (vb_it == v_bits.end()) {
+            // The next bit is not in vector bits. It must be hiding in the
+            // slice bits due to earlier concatenation. Move it into the v_bits
+            // with a shufi. We'll need to pick a v bit to take its place,
+            // ideally one destined to end up in the s bits.
+            vb_it = std::find_if(v_bits.begin(), v_bits.end(), [&](int b) { return b >= first_s_bit; });
+            if (vb_it == v_bits.end()) {
+                vb_it = v_bits.begin();
+            }
+
+            if (s_bits.back() == bit) {
+                // It's the last (or sole) slice bit. Swap it with the first v bit
+                std::swap(s_bits.back(), *vb_it);
+                for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, false);
+                    *a = lo;
+                    *b = hi;
+                });
+            } else {
+                internal_assert(s_bits.size() == 2 && s_bits[0] == bit);
+                // It's the low slice bit. We need shufi with crossover.
+                int v_bit = *vb_it;
+                *vb_it = s_bits[0];
+                s_bits[0] = s_bits[1];
+                s_bits[1] = v_bit;
+                for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, true);
+                    *a = lo;
+                    *b = hi;
+                });
+            }
+        }
+
+        int j = vb_it - v_bits.begin();
+        *vb_it = l_bits.back();
+        l_bits.pop_back();
+        l_bits.insert(l_bits.begin(), bit);
+
+        for_all_pairs(j, [&](auto *a, auto *b) {
+            auto [lo, hi] = unpck(*a, *b);
+            *a = lo;
+            *b = hi;
+        });
+    }
+
+    // Lane bits should now be 0, 1, 2, 3...
+    for (int i = 0; i < (int)l_bits.size(); i++) {
+        internal_assert(l_bits[i] == i);
+    }
+
+    // Time to fix the slice bits
+
+    // First the low slice bit. If it's one of the v bits, move it to be the
+    // high slice bit with a shufi.
+    int low_slice_bit = l_bits.size();
+    auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit);
+    if (ls_in_v != v_bits.end()) {
+        int j = ls_in_v - v_bits.begin();
+        std::swap(*ls_in_v, s_bits.back());
+
+        for_all_pairs(j, [&](auto *a, auto *b) {
+            auto [lo, hi] = shufi(*a, *b, false);
+            *a = lo;
+            *b = hi;
+        });
+    }
+
+    // And then the high slice bit, if there is one
+    if (final_num_s_bits == 2) {
+        // AVX-512
+        int high_slice_bit = low_slice_bit + 1;
+        auto hs_in_v = std::find(v_bits.begin(), v_bits.end(), high_slice_bit);
+        if (hs_in_v != v_bits.end()) {
+            // The high slice bit is in the v_bits. Note that if it's not, it'll
+            // be one of the slice bits. It can't be an l bit, because we've
+            // already finalized them.
+            int j = hs_in_v - v_bits.begin();
+
+            if (!s_bits.empty() && s_bits.back() == low_slice_bit) {
+                // The low slice bit is currently occupying the high slice bit slot,
+                // so we need to shuffle it over at the same time by using the
+                // crossover variant of shufi.
+                int temp = s_bits[0];
+                s_bits[0] = s_bits.back();
+                s_bits.back() = *hs_in_v;
+                *hs_in_v = temp;
+
+                for_all_pairs(j, [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, true);
+                    *a = lo;
+                    *b = hi;
+                });
+
+            } else {
+                // The low slice bit must be already in place, so no crossover required.
+                internal_assert(s_bits[0] == low_slice_bit);
+                std::swap(*hs_in_v, s_bits.back());
+
+                for_all_pairs(j, [&](auto *a, auto *b) {
+                    auto [lo, hi] = shufi(*a, *b, false);
+                    *a = lo;
+                    *b = hi;
+                });
+            }
+        } else if (s_bits.size() == 2 &&
+                   s_bits[0] == high_slice_bit &&
+                   s_bits[1] == low_slice_bit) {
+            // The slice bits are both there, but in the wrong order
+            std::swap(s_bits[0], s_bits[1]);
+            for (auto &vec : v) {
+                vec = self_shufi(vec);
+            }
+        }
+
+        // Both slice bits should be correct now
+        internal_assert(s_bits.size() == 2 &&
+                        s_bits[0] == low_slice_bit &&
+                        s_bits[1] == high_slice_bit);
+
+    } else {
+        // AVX-2. The sole slice bit should be correct now.
+        internal_assert(s_bits.size() == 1 &&
+                        s_bits[0] == low_slice_bit);
+    }
+
+    // The lane and slice bits are correct, but the vectors are in some
+    // arbitrary order. We'll reorder them by deinterleaving the list according
+    // to each bit position, in increasing order.
+    for (size_t i = 0; i < v_bits.size(); i++) {
+        int bit = i + s_bits.size() + l_bits.size();
+        auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit);
+        internal_assert(vb_it != v_bits.end());
+
+        int j = vb_it - v_bits.begin();
+        v_bits.erase(vb_it);
+        v_bits.push_back(bit);
+
+        std::vector<Value *> a, b;
+        a.reserve(v.size() / 2);
+        b.reserve(v.size() / 2);
+        int mask = 1 << j;
+        for (size_t k = 0; k < v.size(); k++) {
+            if ((k & mask) == 0) {
+                a.push_back(v[k]);
+            } else {
+                b.push_back(v[k]);
+            }
+        }
+        v.clear();
+        v.insert(v.end(), a.begin(), a.end());
+        v.insert(v.end(), b.begin(), b.end());
+    }
+
+    // The v bits should be correct now
+    for (int i = 0; i < (int)v_bits.size(); i++) {
+        internal_assert(v_bits[i] == i + (int)(l_bits.size() + s_bits.size()));
+    }
+
+    // Concatenate all results into a single vector. Phew.
+    return concat_vectors(v);
+}
+
 void CodeGen_X86::visit(const Allocate *op) {
     ScopedBinding<MemoryType> bind(mem_type, op->name, op->memory_type);
     CodeGen_Posix::visit(op);
diff --git a/src/IR.cpp b/src/IR.cpp
index 2c91d16b50e1..c5158728f367 100644
--- a/src/IR.cpp
+++ b/src/IR.cpp
@@ -815,6 +815,21 @@ Expr Shuffle::make_interleave(const std::vector<Expr> &vectors) {
     return make(vectors, indices);
 }
 
+Expr Shuffle::make_transpose(Expr e, int cols) {
+    internal_assert(e.type().lanes() % cols == 0)
+        << "Transpose cols must divide the number of lanes.\n";
+    int rows = e.type().lanes() / cols;
+
+    std::vector<int> indices(e.type().lanes());
+    for (int j = 0; j < cols; j++) {
+        for (int i = 0; i < rows; i++) {
+            indices[j * rows + i] = i * cols + j;
+        }
+    }
+
+    return make({std::move(e)}, indices);
+}
+
 Expr Shuffle::make_concat(const std::vector<Expr> &vectors) {
     internal_assert(!vectors.empty()) << "Concat of zero vectors.\n";
 
@@ -1012,6 +1027,33 @@ bool Shuffle::is_concat() const {
     return indices.size() == input_lanes && is_ramp(indices);
 }
 
+bool Shuffle::is_transpose() const {
+    if (vectors.size() > 1 ||
+        (int)indices.size() != vectors[0].type().lanes() ||
+        indices.size() < 2 ||
+        indices[0] != 0 ||
+        indices[1] <= 0) {
+        return false;
+    }
+    int cols = indices[1];
+    int rows = vectors[0].type().lanes() / cols;
+    if ((int)indices.size() != rows * cols) {
+        return false;
+    }
+    for (int row = 0; row < rows; row++) {
+        for (int col = 0; col < cols; col++) {
+            if (indices[col * rows + row] != row * cols + col) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+int Shuffle::transpose_factor() const {
+    return indices[1] - indices[0];
+}
+
 bool Shuffle::is_slice() const {
     size_t input_lanes = 0;
     for (const Expr &i : vectors) {
diff --git a/src/IR.h b/src/IR.h
index 3666581803db..c1d5a57483e3 100644
--- a/src/IR.h
+++ b/src/IR.h
@@ -990,6 +990,13 @@ struct Shuffle : public ExprNode<Shuffle> {
      * interleaving of vectors of the same length. */
     static Expr make_interleave(const std::vector<Expr> &vectors);
 
+    /** Convenience constructor for making a shuffle representing an in-place
+     * transpose of a row-major matrix with the given number of columns. The
+     * output, interpreted as a row-major matrix, therefore has that number of
+     * rows. For example, to turn the vector RGBRGBRGBRGB into RRRRGGGGBBBB cols
+     * would be 3, and to do the reverse cols would be 4. */
+    static Expr make_transpose(Expr e, int cols);
+
     /** Convenience constructor for making a shuffle representing a
      * concatenation of the vectors. */
     static Expr make_concat(const std::vector<Expr> &vectors);
@@ -1010,6 +1017,13 @@ struct Shuffle : public ExprNode<Shuffle> {
      * arguments. */
     bool is_interleave() const;
 
+    /** Check if this shuffle is an in-place transpose of a single vector. The
+     * factor is the number of columns of the source matrix, or equivalently,
+     * the number of rows of the destination matrix, interpreting a vector as a
+     * matrix stored row-major. */
+    bool is_transpose() const;
+    int transpose_factor() const;
+
     /** Check if this shuffle can be represented as a repeating pattern that
      * repeats the same shuffle of the single input vector some number of times.
      * For example: 0, 3, 1, 1,  0, 3, 1, 1, .....,  0, 3, 1, 1
diff --git a/src/IRMatch.h b/src/IRMatch.h
index 4ec8b2694e3f..dc5922a80028 100644
--- a/src/IRMatch.h
+++ b/src/IRMatch.h
@@ -2249,6 +2249,60 @@ HALIDE_ALWAYS_INLINE auto slice(Vec vec, Base base, Stride stride, Lanes lanes)
     return {pattern_arg(vec), pattern_arg(base), pattern_arg(stride), pattern_arg(lanes)};
 }
 
+template<typename Vec, typename Factor>
+struct TransposeOp {
+    struct pattern_tag {};
+    Vec vec;
+    Factor factor;
+
+    static constexpr uint32_t binds = Vec::binds | Factor::binds;
+
+    constexpr static IRNodeType min_node_type = IRNodeType::Shuffle;
+    constexpr static IRNodeType max_node_type = IRNodeType::Shuffle;
+    constexpr static bool canonical = Vec::canonical && Factor::canonical;
+
+    template<uint32_t bound>
+    HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept {
+        if (e.node_type != IRNodeType::Shuffle) {
+            return false;
+        }
+        const Shuffle &v = (const Shuffle &)e;
+        return v.vectors.size() == 1 &&
+               v.is_transpose() &&
+               vec.template match<bound>(*v.vectors[0].get(), state) &&
+               factor.template match<(bound | bindings<Vec>::mask)>(v.transpose_factor(), state);
+    }
+
+    HALIDE_ALWAYS_INLINE
+    Expr make(MatcherState &state, halide_type_t type_hint) const {
+        halide_scalar_value_t factor_val;
+        halide_type_t ty;
+        factor.make_folded_const(factor_val, ty, state);
+        int f = (int)factor_val.u.i64;
+        return Shuffle::make_transpose(vec.make(state, type_hint), f);
+    }
+
+    constexpr static bool foldable = false;
+
+    HALIDE_ALWAYS_INLINE
+    TransposeOp(Vec v, Factor f)
+        : vec(v), factor(f) {
+        static_assert(Factor::foldable, "Factor of transpose should consist only of operations that constant-fold");
+    }
+};
+
+template<typename Vec, typename Factor>
+std::ostream &operator<<(std::ostream &s, const TransposeOp<Vec, Factor> &op) {
+    s << "transpose(" << op.vec << ", " << op.factor << ")";
+    return s;
+}
+
+template<typename Vec, typename Factor>
+HALIDE_ALWAYS_INLINE auto transpose(Vec vec, Factor factor) noexcept
+    -> TransposeOp<decltype(pattern_arg(vec)), decltype(pattern_arg(factor))> {
+    return {pattern_arg(vec), pattern_arg(factor)};
+}
+
 template<typename A>
 struct Fold {
     struct pattern_tag {};
diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp
index e95286af03ee..9cd5527b09a6 100644
--- a/src/IRPrinter.cpp
+++ b/src/IRPrinter.cpp
@@ -1461,6 +1461,11 @@ void IRPrinter::visit(const Shuffle *op) {
         stream << paren(", ") << imm_int(op->slice_begin())
                << paren(", ") << imm_int(op->slice_stride())
                << paren(", ") << imm_int(op->indices.size());
+    } else if (op->is_transpose()) {
+        openf("transpose_vector");
+        print_list(op->vectors);
+        stream << paren(", ") << imm_int(op->transpose_factor());
+
     } else {
         openf("shuffle");
         print_list(op->vectors);
diff --git a/src/Lower.cpp b/src/Lower.cpp
index 9b55bd20840d..08e6f8dd5b97 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -376,7 +376,7 @@ void lower_impl(const vector<Function> &output_funcs,
     log("Lowering after partitioning loops:", s);
 
     debug(1) << "Staging strided loads...\n";
-    s = stage_strided_loads(s);
+    s = stage_strided_loads(s, t);
     log("Lowering after staging strided loads:", s);
 
     debug(1) << "Trimming loops to the region over which they do something...\n";
diff --git a/src/Simplify_Add.cpp b/src/Simplify_Add.cpp
index 6158cc9cd48c..06967a8d32d3 100644
--- a/src/Simplify_Add.cpp
+++ b/src/Simplify_Add.cpp
@@ -120,6 +120,7 @@ Expr Simplify::visit(const Add *op, ExprInfo *info) {
          rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) + z), slice(x + y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(slice(x, c0, c1, c2) + (z - slice(y, c0, c1, c2)), slice(x - y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) - z), slice(x + y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+         rewrite(transpose(x, c0) + transpose(y, c0), transpose(x + y, c0)) ||
 
          (no_overflow(op->type) &&
           (rewrite(x + x * y, x * (y + 1)) ||
diff --git a/src/Simplify_EQ.cpp b/src/Simplify_EQ.cpp
index 994d14cd4cee..5d8c09901b49 100644
--- a/src/Simplify_EQ.cpp
+++ b/src/Simplify_EQ.cpp
@@ -195,6 +195,7 @@ Expr Simplify::visit(const EQ *op, ExprInfo *info) {
                  slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(slice(x, c0, c1, c2) == slice(y, c0, c1, c2) + z,
                  slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+         rewrite(transpose(x, c0) == transpose(y, c0), transpose(x == y, c0)) ||
          false) ||
         (no_overflow(a.type()) && EVAL_IN_LAMBDA  //
          (rewrite(x * y == 0, (x == 0) || (y == 0)) ||
diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp
index bbd67a5bace0..da8a96e41f4f 100644
--- a/src/Simplify_Exprs.cpp
+++ b/src/Simplify_Exprs.cpp
@@ -328,8 +328,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) {
     }
 
     ExprInfo base_info;
-    if (const Ramp *r = index.as<Ramp>()) {
-        mutate(r->base, &base_info);
+    const Ramp *r_index = index.as<Ramp>();
+    if (r_index) {
+        mutate(r_index->base, &base_info);
     }
 
     base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment);
@@ -349,18 +350,41 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) {
                                op->image, op->param, const_true(new_lanes, nullptr), align);
         return Broadcast::make(load, b_index->lanes);
     } else if (s_index &&
-               is_const_one(predicate) &&
                (s_index->is_concat() ||
                 s_index->is_interleave())) {
-        // Loads of concats/interleaves should be concats/interleaves of loads
+        // Loads of concats/interleaves should be concats/interleaves of loads. We'll need to slice up the predicate though.
+        int lane_offset = 0;  // Start lane of the current vector within a concat; operands may differ in lane count.
         std::vector<Expr> loaded_vecs;
         for (const Expr &new_index : s_index->vectors) {
             int new_lanes = new_index.type().lanes();
+            Expr predicate_slice =
+                is_const_one(predicate) ? const_true(new_lanes, nullptr) :
+                s_index->is_concat() ?
+                                          Shuffle::make_slice(predicate, lane_offset, 1, new_lanes) :
+                                          Shuffle::make_slice(predicate, (int)loaded_vecs.size(), op->type.lanes() / new_lanes, new_lanes);
+            predicate_slice = mutate(predicate_slice, nullptr);
+            lane_offset += new_lanes;
             Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index,
-                                   op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{});
+                                   op->image, op->param, predicate_slice, ModulusRemainder{});
             loaded_vecs.emplace_back(std::move(load));
         }
         return Shuffle::make(loaded_vecs, s_index->indices);
+    } else if (const Ramp *inner_ramp = r_index ? r_index->base.as<Ramp>() : nullptr;
+               inner_ramp &&
+               !is_const_one(inner_ramp->stride) &&
+               is_const_one(r_index->stride)) {
+        // If it's a nested ramp and the outer ramp has stride 1, swap the
+        // nesting order of the ramps to make dense loads and transpose the
+        // resulting vector instead.
+        Expr transposed_index =
+            Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes),
+                       Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes);
+        Expr transposed_predicate = (predicate.as<Broadcast>() ?
+                                         predicate :  // common case optimization
+                                         Shuffle::make_transpose(predicate, inner_ramp->lanes));
+        Expr transposed_load =
+            Load::make(op->type, op->name, transposed_index, op->image, op->param, transposed_predicate, align);
+        return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info);
     } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) {
         return op;
     } else {
diff --git a/src/Simplify_Max.cpp b/src/Simplify_Max.cpp
index 1926bc9a069e..cc4253ca718f 100644
--- a/src/Simplify_Max.cpp
+++ b/src/Simplify_Max.cpp
@@ -212,6 +212,7 @@ Expr Simplify::visit(const Max *op, ExprInfo *info) {
          rewrite(max(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(max(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(max(slice(x, c0, c1, c2), max(slice(y, c0, c1, c2), z)), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(max(slice(x, c0, c1, c2), max(z, slice(y, c0, c1, c2))), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+         rewrite(max(transpose(x, c0), transpose(y, c0)), transpose(max(x, y), c0)) ||
 
          (no_overflow(op->type) &&
           (rewrite(max(max(x, y) + c0, x), max(x, y + c0), c0 < 0) ||
diff --git a/src/Simplify_Min.cpp b/src/Simplify_Min.cpp
index 3f6084c6c4f1..e6515ab280e9 100644
--- a/src/Simplify_Min.cpp
+++ b/src/Simplify_Min.cpp
@@ -214,6 +214,7 @@ Expr Simplify::visit(const Min *op, ExprInfo *info) {
          rewrite(min(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(min(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(min(slice(x, c0, c1, c2), min(slice(y, c0, c1, c2), z)), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite(min(slice(x, c0, c1, c2), min(z, slice(y, c0, c1, c2))), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+         rewrite(min(transpose(x, c0), transpose(y, c0)), transpose(min(x, y), c0)) ||
          (no_overflow(op->type) &&
           (rewrite(min(min(x, y) + c0, x), min(x, y + c0), c0 > 0) ||
            rewrite(min(min(x, y) + c0, x), min(x, y) + c0, c0 < 0) ||
diff --git a/src/Simplify_Mul.cpp b/src/Simplify_Mul.cpp
index dfa38d39111c..e1bcb68fe7bc 100644
--- a/src/Simplify_Mul.cpp
+++ b/src/Simplify_Mul.cpp
@@ -81,6 +81,7 @@ Expr Simplify::visit(const Mul *op, ExprInfo *info) {
         rewrite(slice(x, c0, c1, c2) * slice(y, c0, c1, c2), slice(x * y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
         rewrite(slice(x, c0, c1, c2) * (slice(y, c0, c1, c2) * z), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
         rewrite(slice(x, c0, c1, c2) * (z * slice(y, c0, c1, c2)), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+        rewrite(transpose(x, c0) * transpose(y, c0), transpose(x * y, c0)) ||
 
         false) {
         return mutate(rewrite.result, info);
diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp
index aecb4c6fc99a..2a614ac81744 100644
--- a/src/Simplify_Shuffle.cpp
+++ b/src/Simplify_Shuffle.cpp
@@ -95,10 +95,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) {
     // broadcast. Note that it doesn't matter what the indices
     // are.
     const Broadcast *b1 = new_vectors[0].as<Broadcast>();
-    if (b1) {
+    if (b1 && b1->value.type().is_scalar()) {
         bool can_collapse = true;
         for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) {
-            if (const Broadcast *b2 = new_vectors[i].as<Broadcast>()) {
+            if (const Broadcast *b2 = new_vectors[i].as<Broadcast>();
+                b2 && b2->value.type().is_scalar()) {
                 Expr check = mutate(b1->value - b2->value, nullptr);
                 can_collapse &= is_const_zero(check);
             } else {
diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp
index 152c8b0fe797..3a6b459b5c88 100644
--- a/src/Simplify_Stmts.cpp
+++ b/src/Simplify_Stmts.cpp
@@ -342,12 +342,14 @@ Stmt Simplify::visit(const Store *op) {
     }
 
     ExprInfo base_info;
-    if (const Ramp *r = index.as<Ramp>()) {
-        mutate(r->base, &base_info);
+    const Ramp *r_index = index.as<Ramp>();
+    if (r_index) {
+        mutate(r_index->base, &base_info);
     }
     base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment);
 
     const Load *load = value.as<Load>();
+    const Shuffle *shuf = index.as<Shuffle>();
     const Broadcast *scalar_pred = predicate.as<Broadcast>();
     if (scalar_pred && !scalar_pred->value.type().is_scalar()) {
         // Nested vectorization
@@ -365,6 +367,45 @@ Stmt Simplify::visit(const Store *op) {
     } else if (is_undef(value) || (load && load->name == op->name && equal(load->index, index))) {
         // foo[x] = foo[x] or foo[x] = undef is a no-op
         return Evaluate::make(0);
+    } else if (shuf && shuf->is_concat()) {
+        // Break a store of a concat of vector indices into separate stores. A
+        // concat index will result in a general scatter at codegen time. We
+        // should just break it up here, where there is a hope that the
+        // individual elements might be simplifiable to dense ramps.
+        std::string var_name = unique_name('t');
+        Expr var = Variable::make(value.type(), var_name);
+        std::vector<Stmt> stores;
+        int lanes = 0;
+        for (const Expr &idx : shuf->vectors) {
+            stores.push_back(Store::make(op->name,
+                                         Shuffle::make_slice(var, lanes, 1, idx.type().lanes()),
+                                         idx,
+                                         op->param,
+                                         Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()),
+                                         ModulusRemainder{}));
+            lanes += idx.type().lanes();
+        }
+        Stmt s = Block::make(stores);
+        s = LetStmt::make(var_name, value, s);
+        return mutate(s);
+    } else if (const Ramp *inner_ramp = r_index ? r_index->base.as<Ramp>() : nullptr;
+               inner_ramp &&
+               !is_const_one(inner_ramp->stride) &&
+               is_const_one(r_index->stride)) {
+        // If it's a nested ramp and the outer ramp has stride 1, swap the
+        // nesting order of the ramps to make dense stores and transpose the
+        // index and value instead. Later in lowering after flattening the
+        // nested ramps it will turn into a concat of dense ramps and hit the
+        // case above.
+        Expr transposed_index =
+            Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes),
+                       Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes);
+        Expr transposed_value = Shuffle::make_transpose(value, inner_ramp->lanes);
+        Expr transposed_predicate = (predicate.as<Broadcast>() ?
+                                         predicate :  // common case optimization
+                                         Shuffle::make_transpose(predicate, inner_ramp->lanes));
+        return mutate(Store::make(op->name, transposed_value, transposed_index,
+                                  op->param, transposed_predicate, align));
     } else if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) {
         return op;
     } else {
diff --git a/src/Simplify_Sub.cpp b/src/Simplify_Sub.cpp
index 29bd02c78ed6..2444cb6fd1d9 100644
--- a/src/Simplify_Sub.cpp
+++ b/src/Simplify_Sub.cpp
@@ -177,6 +177,7 @@ Expr Simplify::visit(const Sub *op, ExprInfo *info) {
          rewrite(slice(x, c0, c1, c2) - (slice(y, c0, c1, c2) + z), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite((slice(x, c0, c1, c2) - z) - slice(y, c0, c1, c2), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) ||
          rewrite((z - slice(x, c0, c1, c2)) - slice(y, c0, c1, c2), z - slice(x + y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) ||
+         rewrite(transpose(x, c0) - transpose(y, c0), transpose(x - y, c0)) ||
 
          (no_overflow(op->type) && EVAL_IN_LAMBDA  //
           (rewrite(max(x, y) - x, max(y - x, 0)) ||
diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp
index 5073d6194522..896a33b5193e 100644
--- a/src/StageStridedLoads.cpp
+++ b/src/StageStridedLoads.cpp
@@ -104,7 +104,7 @@ class FindStridedLoads : public IRVisitor {
                 // TODO: We do not yet handle nested vectorization here for
                 // ramps which have not already collapsed. We could potentially
                 // handle more interesting types of shuffle than simple flat slices.
-                if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) {
+                if (stride >= 2 && r->stride.type().is_scalar()) {
                     const IRNode *s = scope;
                     const Allocate *a = nullptr;
                     if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) {
@@ -283,7 +283,7 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr &
 
 }  // namespace
 
-Stmt stage_strided_loads(const Stmt &stmt) {
+Stmt stage_strided_loads(const Stmt &stmt, const Target &target) {
     FindStridedLoads finder;
     ReplaceStridedLoads replacer;
 
@@ -334,9 +334,23 @@ Stmt stage_strided_loads(const Stmt &stmt) {
             Type t = k.type.with_lanes(lanes);
             const Load *op = load->second[0];
 
+            int last_offset = first_offset;
+            int64_t biggest_gap = 0;
             std::set<const Load *> all_loads;
             for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) {
                 all_loads.insert(l->second.begin(), l->second.end());
+                biggest_gap = std::max(biggest_gap, l->first - last_offset);
+                last_offset = l->first;
+            }
+            biggest_gap = std::max(biggest_gap, (first_offset + k.stride) - last_offset);
+
+            // If our contiguous shared load has contiguous vectors in it of
+            // size at least k.lanes that are going to be entirely unused, this
+            // is a bad idea (e.g. a cluster of {ramp(0, 1024, 8) and ramp(37,
+            // 1024, 8)} should not be staged).
+            if (biggest_gap >= k.lanes) {
+                load++;
+                continue;
             }
 
             Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param,
@@ -353,15 +367,28 @@ Stmt stage_strided_loads(const Stmt &stmt) {
             const IRNode *outermost = k.scope ? k.scope : s.get();
             const IRNode *let_site = innermost_containing_node(outermost, all_loads);
             if (can_hoist_shared_load(let_site, k.buf, idx)) {
+                // For larger strides we can do a better job at shuffling if we
+                // do it as one big task. For stride 2 it interferes with
+                // horizontal add pattern matching. On ARM it also interferes
+                // with LLVM's pattern matching for vld3 and vld4.
+                bool transpose_shared_load = k.stride > 2;
+                if (target.arch == Target::ARM || target.arch == Target::Hexagon) {
+                    transpose_shared_load = k.stride > 4;
+                }
                 std::string name = unique_name('t');
                 Expr var = Variable::make(shared_load.type(), name);
                 for (; load != v.end() && load->first < first_offset + k.stride; load++) {
                     int row = load->first - first_offset;
-                    Expr shuf = Shuffle::make_slice(var, row, k.stride, k.lanes);
+                    Expr shuf = transpose_shared_load ?
+                                    Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) :
+                                    Shuffle::make_slice(var, row, k.stride, k.lanes);
                     for (const Load *l : load->second) {
                         replacer.replacements.emplace(l, shuf);
                     }
                 }
+                if (transpose_shared_load) {
+                    shared_load = Shuffle::make_transpose(shared_load, k.stride);
+                }
                 replacer.let_injections[let_site].emplace_back(name, shared_load);
             } else {
                 for (; load != v.end() && load->first < first_offset + k.stride; load++) {
@@ -378,7 +405,7 @@ Stmt stage_strided_loads(const Stmt &stmt) {
         // picked up in a cluster, but for whom we know it's safe to do a
         // dense load before their start.
         for (const auto &[offset, loads] : reverse_view(v)) {
-            if (replacer.replacements.count(loads[0])) {
+            if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) {
                 continue;
             }
             int64_t delta = k.stride - 1;
@@ -403,7 +430,7 @@ Stmt stage_strided_loads(const Stmt &stmt) {
         // Look for any loads we can densify because an overlapping load occurs
         // in any parent scope.
         for (const auto &[offset, loads] : reverse_view(v)) {
-            if (replacer.replacements.count(loads[0])) {
+            if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) {
                 continue;
             }
             int64_t min_offset = offset;
@@ -443,7 +470,7 @@ Stmt stage_strided_loads(const Stmt &stmt) {
         // external allocations by doing a dense load at a trimmed size. We rely
         // on codegen to do a good job at loading vectors of a funny size.
         for (const auto &[offset, loads] : v) {
-            if (replacer.replacements.count(loads[0])) {
+            if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) {
                 continue;
             }
 
diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h
index a29cef2438f1..b6afd3770981 100644
--- a/src/StageStridedLoads.h
+++ b/src/StageStridedLoads.h
@@ -37,7 +37,7 @@ namespace Internal {
  * internal allocations it adds padding to the allocation explicitly, by setting
  * the padding field on Allocate nodes.
  */
-Stmt stage_strided_loads(const Stmt &s);
+Stmt stage_strided_loads(const Stmt &s, const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/Util.h b/src/Util.h
index 6382b497dc5d..f29e0ad9b6f0 100644
--- a/src/Util.h
+++ b/src/Util.h
@@ -575,6 +575,16 @@ inline int64_t next_power_of_two(int64_t x) {
     return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
 }
 
+/** Returns the largest power of two which is a factor of the argument (0 if the argument is 0). */
+inline int64_t largest_power_of_two_factor(int64_t x) {
+    return x & -x;  // Isolates the lowest set bit of x.
+}
+
+/** Return whether or not an integer is a power of two. Zero and negative values are not. */
+inline bool is_power_of_two(int64_t x) {
+    return x > 0 && (x & (x - 1)) == 0;
+}
+
 template<typename T>
 inline T align_up(T x, int n) {
     return (x + n - 1) / n * n;
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index 6d41a9e71219..04c087f33338 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -322,6 +322,7 @@ tests(GROUPS correctness
       tracing_broadcast.cpp
       tracing_stack.cpp
       transitive_bounds.cpp
+      transpose_idioms.cpp
       trim_no_ops.cpp
       tuple_partial_update.cpp
       tuple_reduction.cpp
diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp
index f0183412323a..226884e2cac6 100644
--- a/test/correctness/simd_op_check_sve2.cpp
+++ b/test/correctness/simd_op_check_sve2.cpp
@@ -447,13 +447,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 Expr shift = (i_2 % bits) - (bits / 2);
                 Expr round_s = (cast_i(1) >> min(shift, 0)) / 2;
                 Expr round_u = (cast_u(1) >> min(shift, 0)) / 2;
-                add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift));
-                add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift));
+                // The r suffix is optional - it just changes which of the two args gets clobbered
+                add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) << shift));
+                add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) << shift));
 
                 round_s = (cast_i(1) << max(shift, 0)) / 2;
                 round_u = (cast_u(1) << max(shift, 0)) / 2;
-                add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift));
-                add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift));
+                add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) >> shift));
+                add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) >> shift));
 
                 // VRSHR    I       -       Rounding Shift Right
                 add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1));
@@ -1220,6 +1221,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             std::stringstream type_name_stream;
             type_name_stream << e.type();
             std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor);
+
+            // Some regex symbols are illegal in filenames on windows
+            std::string illegal = "<>:\"/\\|?*";
+            std::replace_if(decorated_op_name.begin(), decorated_op_name.end(),  //
+                            [&](char c) { return illegal.find(c) != std::string::npos; }, '_');
+
             auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size());
 
             // Bail out after generating the unique_name, so that names are
diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp
index 8a82f5ca33d1..e0373a5be69e 100644
--- a/test/correctness/stage_strided_loads.cpp
+++ b/test/correctness/stage_strided_loads.cpp
@@ -200,7 +200,7 @@ int main(int argc, char **argv) {
     {
         Func f;
         Var x;
-        f(x) = buf(17 * x) + buf(17 * x + 15);
+        f(x) = buf(50 * x) + buf(50 * x + 15);
         f.vectorize(x, 16, TailStrategy::RoundUp);
 
         checker.check_not(f, 0);
diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp
new file mode 100644
index 000000000000..9fb29c2883e0
--- /dev/null
+++ b/test/correctness/transpose_idioms.cpp
@@ -0,0 +1,212 @@
+#include "Halide.h"
+
+using namespace Halide;
+using namespace Halide::Internal;
+
+// This test enumerates all the scheduling idioms in Halide that *should*
+// produce good code for a transpose/interleave/deinterleave operation.
+
+class Checker : public IRMutator {
+    // Tallies the kinds of loads, stores and shuffles it visits; check() asserts on the totals.
+    using IRMutator::visit;
+
+    Expr visit(const Load *op) override {
+        if (const Ramp *r = op->index.as<Ramp>();
+            r && is_const_one(r->stride)) {
+            dense_loads++;
+        } else if (op->type.is_vector()) {
+            gathers++;  // Any other vector load counts as a gather.
+        }
+        return IRMutator::visit(op);
+    }
+
+    Stmt visit(const Store *op) override {
+        if (const Ramp *r = op->index.as<Ramp>();
+            r && is_const_one(r->stride)) {
+            dense_stores++;
+        } else if (op->index.type().is_vector()) {
+            scatters++;  // Any other store with a vector index counts as a scatter.
+        }
+        return IRMutator::visit(op);
+    }
+
+    Expr visit(const Shuffle *op) override {
+        transposes += op->is_transpose();  // bool result adds 0 or 1
+        interleaves += op->is_interleave();
+        if (op->is_slice()) {
+            if (op->slice_stride() == 1) {
+                dense_slices++;
+            } else {
+                strided_slices++;
+            }
+        }
+        return IRMutator::visit(op);
+    }
+
+public:
+    int dense_loads = 0;     // Loads whose index is a ramp with constant stride one
+    int gathers = 0;         // Vector loads with any other index
+    int dense_stores = 0;    // Stores whose index is a ramp with constant stride one
+    int scatters = 0;        // Stores with any other vector index
+    int dense_slices = 0;    // Slice shuffles with slice_stride() == 1
+    int strided_slices = 0;  // Slice shuffles with any other stride
+    int interleaves = 0;     // Shuffles for which is_interleave() holds
+    int transposes = 0;      // Shuffles for which is_transpose() holds
+
+    void check() {  // Requires dense loads/stores plus a shuffle, and no gathers/scatters/strided slices.
+        internal_assert(gathers == 0) << "Vector gathers found";
+        internal_assert(scatters == 0) << "Vector scatters found";
+        internal_assert(strided_slices == 0) << "strided slices found";
+        internal_assert(dense_loads) << "No dense loads found";
+        internal_assert(dense_stores) << "No dense stores found";
+        internal_assert(interleaves + transposes) << "No interleaves or transposes found";
+    }
+};
+
+void check(Func g) {  // Realizes g, verifies every output value, then checks the IR tallies.
+    Checker checker;
+    g.add_custom_lowering_pass(&checker, nullptr);  // nullptr: presumably no deleter, checker is a local
+
+    // Choose a shape with lots of factors so that our RoundUp schedules work
+    int n = 16 * 9 * 7;
+    Buffer<int> out = g.realize({n, n});
+    for (int y = 0; y < out.height(); y++) {
+        for (int x = 0; x < out.width(); x++) {
+            int correct = 100 * x + y;  // Callers define g(x, y) = f(y, x) with f(x, y) = x + 100 * y
+            internal_assert(out(x, y) == correct)
+                << "out(" << x << ", " << y << ") = " << out(x, y)
+                << " instead of " << correct << "\n";
+        }
+    }
+
+    checker.check();
+}
+
+int main(int argc, char **argv) {
+    Var x{"x"}, y{"y"}, xi{"xi"}, yi{"yi"};
+
+    // In each case we'll say g(x, y) = f(y, x) and tile it. We will try power
+    // of two sizes, and sizes that are coprime, and sizes that are neither
+    // coprime nor powers of two. We'll use sizes larger than 4, because some
+    // backends like to do different things for small strides.
+
+    for (auto tile : {std::pair{8, 16}, {7, 5}, {6, 9}}) {
+        {
+            // Idiom 1: Strided stores into a staged transposed copy of the
+            // input. The strided stores then get mashed together into one big
+            // interleave + store by the pass that interleaves strided
+            // stores. This has to be done on a staged copy of the input rather
+            // than g so that the strided stores have a constant stride.
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp)
+                .vectorize(xi)
+                .unroll(yi);
+
+            f.in().compute_at(g, x).reorder_storage(y, x).vectorize(x).unroll(y);
+
+            check(g);
+        }
+
+        {
+            // Idiom 2: Vectorize x, unroll y. Stage a copy of the input but
+            // don't transpose it. This will create strided loads from the
+            // staged input that get hoisted out into one big dense load +
+            // transpose by the stage_strided_loads pass. The staging is
+            // required so that the strides are constant.
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp)
+                .vectorize(xi)
+                .unroll(yi);
+
+            f.in().compute_at(g, x).vectorize(x).unroll(y);
+
+            check(g);
+        }
+
+        {
+            // Idiom 3: Vectorize both, x innermost. This should be handled by
+            // shuffle optimization logic in the simplifier: a store of a concat
+            // of ramps turns into a sequence of stores of slices of the RHS,
+            // and a load of a ramp of a ramp where the *outer* ramp has stride
+            // 1 but the inner doesn't turns into a transpose of a concat of
+            // dense loads.
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp)
+                .vectorize(xi)
+                .vectorize(yi);
+
+            check(g);
+        }
+
+        {
+            // Idiom 4: Vectorize both, y innermost. In this case the store of a
+            // ramp of a ramp gets rewritten by the simplifier to move the ramp
+            // with stride one innermost, transposing the RHS.
+
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp)
+                .reorder(yi, xi)
+                .vectorize(xi)
+                .vectorize(yi);
+
+            check(g);
+        }
+    }
+
+    {
+        // Check the double-vectorization approaches also work when there is a
+        // vector predicate on one of the two vectors, to be sure the simplifier
+        // is transforming the predicate correctly. We can't predicate both,
+        // because the vectorizer can't handle it and generates a scalar tail.
+        {
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g
+                .never_partition(x, y)
+                .split(x, x, xi, 13, TailStrategy::Predicate)
+                .split(y, y, yi, 11, TailStrategy::ShiftInwards)
+                .reorder(xi, yi, x, y)
+                .vectorize(xi)
+                .vectorize(yi);
+
+            check(g);
+        }
+        {
+            Func f{"f"}, g{"g"};
+            f(x, y) = x + 100 * y;
+            g(x, y) = f(y, x);
+            f.compute_root();
+
+            g
+                .never_partition(x, y)
+                .split(x, x, xi, 13, TailStrategy::ShiftInwards)
+                .split(y, y, yi, 11, TailStrategy::Predicate)
+                .reorder(yi, xi, x, y)
+                .vectorize(xi)
+                .vectorize(yi);
+
+            check(g);
+        }
+    }
+
+    printf("Success!\n");
+}
diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt
index 3e8142f882c0..4c2b515814b5 100644
--- a/test/performance/CMakeLists.txt
+++ b/test/performance/CMakeLists.txt
@@ -18,6 +18,7 @@ tests(GROUPS performance
       fast_pow.cpp
       fast_sine_cosine.cpp
       gpu_half_throughput.cpp
+      interleave.cpp
       jit_stress.cpp
       lots_of_inputs.cpp
       memcpy.cpp
diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp
index 740908358443..921d7f9a913b 100644
--- a/test/performance/block_transpose.cpp
+++ b/test/performance/block_transpose.cpp
@@ -7,108 +7,77 @@
 using namespace Halide;
 using namespace Halide::Tools;
 
-enum {
-    scalar_trans,
-    vec_y_trans,
-    vec_x_trans
+struct Result {
+    int type_size, block_width, block_height;
+    double bandwidth;
 };
 
-Buffer<uint16_t> test_transpose(int mode) {
-    Func input, block, block_transpose, output;
-    Var x, y;
-
-    input(x, y) = cast<uint16_t>(x + y);
-    input.compute_root();
+template<typename T>
+Result test_transpose(int block_width, int block_height, const Target &t) {
+    const int N = 256;
+    Buffer<T> in(N, N), out(N, N);
 
-    block(x, y) = input(x, y);
-    block_transpose(x, y) = block(y, x);
-    output(x, y) = block_transpose(x, y);
-
-    Var xi, yi;
-    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);
-
-    // Do 8 vectorized loads from the input.
-    block.compute_at(output, x).vectorize(x).unroll(y);
-
-    std::string algorithm;
-    switch (mode) {
-    case scalar_trans:
-        block_transpose.compute_at(output, x).unroll(x).unroll(y);
-        algorithm = "Scalar transpose";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector<Argument>());
-        break;
-    case vec_y_trans:
-        block_transpose.compute_at(output, x).vectorize(y).unroll(x);
-        algorithm = "Transpose vectorized in y";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector<Argument>());
-        break;
-    case vec_x_trans:
-        block_transpose.compute_at(output, x).vectorize(x).unroll(y);
-        algorithm = "Transpose vectorized in x";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector<Argument>());
-        break;
+    for (int y = 0; y < N; y++) {
+        for (int x = 0; x < N; x++) {
+            in(x, y) = (T)(x + y * N);
+        }
     }
 
-    Buffer<uint16_t> result(1024, 1024);
-    output.compile_jit();
-
-    output.realize(result);
-
-    double t = benchmark([&]() {
-        output.realize(result);
-    });
-
-    std::cout << "Dummy Func version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n";
-    return result;
-}
-
-/* This illustrates how to achieve the same scheduling behavior using the 'in()'
- * directive as opposed to creating dummy Funcs as done in 'test_transpose()' */
-Buffer<uint16_t> test_transpose_wrap(int mode) {
     Func input, block_transpose, block, output;
     Var x, y;
 
-    input(x, y) = cast<uint16_t>(x + y);
-    input.compute_root();
+    input(x, y) = in(x, y);
 
     output(x, y) = input(y, x);
 
     Var xi, yi;
-    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);
-
-    // Do 8 vectorized loads from the input.
-    block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y);
-
-    std::string algorithm;
-    switch (mode) {
-    case scalar_trans:
-        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).unroll(x).unroll(y);
-        algorithm = "Scalar transpose";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector<Argument>());
-        break;
-    case vec_y_trans:
-        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(y).unroll(x);
-        algorithm = "Transpose vectorized in y";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector<Argument>());
-        break;
-    case vec_x_trans:
-        block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);
-        algorithm = "Transpose vectorized in x";
-        output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector<Argument>());
-        break;
-    }
+    output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp)
+        .vectorize(xi)
+        .vectorize(yi);
+
+    // Explicitly vectorized loads from the input. Was necessary before we
+    // automatically swizzled the 2D load into dense order.
+    // input.in().compute_at(output, x).vectorize(x).unroll(y);
+
+    // Explicit transpose in registers. This used to be the idiom, but is no
+    // longer necessary because stage_strided_loads should detect the strided
+    // loads from input.in() and turn it into a transpose.
+    // input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);
+
+    // TODO: Should not be necessary, but prevents licm from doing something dumb. Uses N so it tracks the buffer size.
+    output.output_buffer().dim(0).set_bounds(0, N);
 
-    Buffer<uint16_t> result(1024, 1024);
     output.compile_jit();
 
-    output.realize(result);
+    output.realize(out);
 
-    double t = benchmark([&]() {
-        output.realize(result);
+    double time = benchmark(10, 10, [&]() {
+        output.realize(out);
     });
 
-    std::cout << "Wrapper version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n";
-    return result;
+    for (int y = 0; y < N; y++) {
+        for (int x = 0; x < N; x++) {
+            T actual = out(x, y), correct = in(y, x);
+            if (actual != correct) {
+                std::cerr << "For block size (" << block_width << ", " << block_height << "): "
+                          << "out(" << x << ", " << y << ") = "
+                          << actual << " instead of " << correct << "\n";
+                exit(1);
+            }
+        }
+    }
+
+    // Uncomment to dump asm for inspection
+    /*
+    output.compile_to_assembly(Internal::get_test_tmp_dir() + "transpose_uint" +
+                                   std::to_string(sizeof(T) * 8) + "_" +
+                                   std::to_string(block_width) + "x" +
+                                   std::to_string(block_height) + ".s",
+                               std::vector<Argument>{in}, "transpose", t);
+    */
+
+    return Result{(int)sizeof(T), block_width, block_height,
+                  out.size_in_bytes() / (1.0e9 * time)};
 }
 
 int main(int argc, char **argv) {
@@ -118,23 +87,48 @@ int main(int argc, char **argv) {
         return 0;
     }
 
-    test_transpose(scalar_trans);
-    test_transpose_wrap(scalar_trans);
-    test_transpose(vec_y_trans);
-    test_transpose_wrap(vec_y_trans);
-
-    Buffer<uint16_t> im1 = test_transpose(vec_x_trans);
-    Buffer<uint16_t> im2 = test_transpose_wrap(vec_x_trans);
-
-    // Check correctness of the wrapper version
-    for (int y = 0; y < im2.height(); y++) {
-        for (int x = 0; x < im2.width(); x++) {
-            if (im2(x, y) != im1(x, y)) {
-                printf("wrapper(%d, %d) = %d instead of %d\n",
-                       x, y, im2(x, y), im1(x, y));
-                return 1;
+    // Set the target features to use for dumping to assembly
+    target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery});
+
+    std::cout << "Computing best tile sizes for each type\n";
+    std::vector<Result> results;
+    int limit = 64 * 64;
+    for (int bh : {1, 2, 4, 8, 16, 32, 64}) {
+        for (int bw : {1, 2, 4, 8, 16, 32, 64}) {
+            std::cout << "." << std::flush;
+            results.push_back(test_transpose<uint8_t>(bw, bh, target));
+            if (bw * bh <= limit / 2) {
+                results.push_back(test_transpose<uint16_t>(bw, bh, target));
+            }
+            if (bw * bh <= limit / 4) {
+                results.push_back(test_transpose<uint32_t>(bw, bh, target));
+            }
+            if (bw * bh <= limit / 8) {
+                results.push_back(test_transpose<uint64_t>(bw, bh, target));
+            }
+        }
+    }
+    std::cout << "\nbytes, tile width, tile height, bandwidth (GB/s):\n";
+
+    // Sort the results by bandwidth
+    std::sort(results.begin(), results.end(),
+              [](const Result &a, const Result &b) {
+                  return a.bandwidth > b.bandwidth;
+              });
+
+    // Print top n tile sizes for each type
+    for (int t : {1, 2, 4, 8}) {
+        int top_n = 5;
+        for (size_t i = 0; i < results.size() && top_n > 0; i++) {
+            if (results[i].type_size == t) {
+                std::cout << t << " "
+                          << results[i].block_width << " "
+                          << results[i].block_height << " "
+                          << results[i].bandwidth << "\n";
+                top_n--;
             }
         }
+        std::cout << "\n";
     }
 
     printf("Success!\n");
diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp
new file mode 100644
index 000000000000..ee1598e40d41
--- /dev/null
+++ b/test/performance/interleave.cpp
@@ -0,0 +1,158 @@
+#include "Halide.h"
+#include "halide_benchmark.h"
+#include "halide_test_dirs.h"
+
+#include <cstdio>
+
+using namespace Halide;
+using namespace Halide::Tools;
+
+struct Result {
+    int type_size, factor;  // sizeof(T) of the element type tested; interleave factor used
+    double bandwidth;       // output bytes / (1e9 * benchmarked time); printed under a GB/s heading
+};
+
+// Benchmarks interleaving `factor` rows of N elements of type T into one dense
+// row, verifies the output, and returns the bandwidth figure reported as GB/s.
+template<typename T>
+Result test_interleave(int factor, const Target &t) {
+    const int N = 8192;
+    Buffer<T> in(N, factor), out(N * factor);
+
+    for (int y = 0; y < factor; y++) {
+        for (int x = 0; x < N; x++) {
+            in(x, y) = (T)(x * factor + y);
+        }
+    }
+
+    Func output;
+    Var x;
+
+    output(x) = in(x / factor, x % factor);
+
+    // We'll use the interleaving-stores scheduling idiom, where unrolling
+    // strided stores gets mashed together into a single dense store of an
+    // interleave_vectors call.
+    output.unroll(x, factor, TailStrategy::RoundUp)
+        .vectorize(x, t.natural_vector_size<T>(), TailStrategy::RoundUp);
+    output.output_buffer().dim(0).set_min(0);
+
+    output.compile_jit();
+    output.realize(out);  // run once outside the timed loop
+
+    double time = benchmark(20, 20, [&]() {
+        output.realize(out);
+    });
+
+    for (int y = 0; y < factor; y++) {
+        for (int x = 0; x < N; x++) {
+            uint64_t actual = out(x * factor + y), correct = in(x, y);
+            if (actual != correct) {
+                std::cerr << "For factor " << factor << ": "
+                          << "out(" << x << " * " << factor << " + " << y << ") = "
+                          << actual << " instead of " << correct << "\n";
+                exit(1);
+            }
+        }
+    }
+
+    // Uncomment to dump asm for inspection
+    // output.compile_to_assembly("/dev/stdout",
+    // std::vector<Argument>{in}, "interleave", t);
+
+    return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)};
+}
+
+// Benchmarks splitting one dense row of N * factor elements of type T into
+// `factor` rows of N, verifies the output, and returns the figure reported as GB/s.
+template<typename T>
+Result test_deinterleave(int factor, const Target &t) {
+    const int N = 8192;
+    Buffer<T> in(N * factor), out(N, factor);
+
+    for (int x = 0; x < N; x++) {
+        for (int y = 0; y < factor; y++) {
+            in(x * factor + y) = (T)(x + y * N);
+        }
+    }
+
+    Func output;
+    Var x, y;
+
+    output(x, y) = in(x * factor + y);
+
+    output.bound(y, 0, factor)
+        .reorder(y, x)
+        .unroll(y)  // Also works if we vectorize y
+        .vectorize(x, t.natural_vector_size<T>(), TailStrategy::RoundUp);
+
+    output.compile_jit();
+    output.realize(out);  // run once outside the timed loop
+
+    double time = benchmark(20, 20, [&]() {
+        output.realize(out);
+    });
+
+    for (int y = 0; y < factor; y++) {
+        for (int x = 0; x < N; x++) {
+            uint64_t actual = out(x, y), correct = in(x * factor + y);
+            if (actual != correct) {
+                std::cerr << "For factor " << factor << ": "
+                          << "out(" << x << ", " << y << ") = "
+                          << actual << " instead of " << correct << "\n";
+                exit(1);
+            }
+        }
+    }
+
+    // Uncomment to dump asm for inspection
+    // output.compile_to_assembly("/dev/stdout",
+    // std::vector<Argument>{in}, "deinterleave", t);
+
+    return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)};
+}
+
+// Runs the interleave and deinterleave benchmarks for each element size and factor, printing one row each.
+int main(int argc, char **argv) {
+    Target target = get_jit_target_from_environment();
+    if (target.arch == Target::WebAssembly) {
+        printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
+        return 0;
+    }
+
+    // Set the target features to use for dumping to assembly
+    target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery});
+    std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n";
+    for (int t : {1, 2, 4, 8}) {  // element size in bytes
+        for (int f = 2; f < 16; f++) {  // interleave factor
+            Result r1{}, r2{};  // value-initialized so the default case never prints indeterminate values
+            switch (t) {
+            case 1:
+                r1 = test_interleave<uint8_t>(f, target);
+                r2 = test_deinterleave<uint8_t>(f, target);
+                break;
+            case 2:
+                r1 = test_interleave<uint16_t>(f, target);
+                r2 = test_deinterleave<uint16_t>(f, target);
+                break;
+            case 4:
+                r1 = test_interleave<uint32_t>(f, target);
+                r2 = test_deinterleave<uint32_t>(f, target);
+                break;
+            case 8:
+                r1 = test_interleave<uint64_t>(f, target);
+                r2 = test_deinterleave<uint64_t>(f, target);
+                break;
+            default:
+                break;
+            }
+            std::cout << r1.type_size << " "
+                      << r1.factor << " "
+                      << r1.bandwidth << " "
+                      << r2.bandwidth << "\n";
+        }
+    }
+
+    printf("Success!\n");
+    return 0;
+}