From 9e89b7cee5656925ce782303efab3c94fff9584e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Jan 2026 15:52:50 -0800 Subject: [PATCH 01/34] Specialized x86 implementation of interleave_vectors --- apps/iir_blur/Makefile | 2 +- apps/iir_blur/iir_blur_generator.cpp | 18 +- src/CodeGen_Hexagon.cpp | 4 - src/CodeGen_LLVM.cpp | 15 +- src/CodeGen_LLVM.h | 6 +- src/CodeGen_X86.cpp | 604 +++++++++++++++++++++++++++ src/Util.h | 5 + 7 files changed, 638 insertions(+), 16 deletions(-) diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 49104b3e5fa3..92ed5d2a5b0b 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule. $(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS) $(BIN)/%/out.png: $(BIN)/%/filter - $< ../images/rgba.png $(BIN)/$*/out.png + $< ../images/rgb.png $(BIN)/$*/out.png clean: rm -rf $(BIN) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index ef3b44eef461..4e4db6e61410 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -38,17 +38,25 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule // CPU schedule. // 8.2ms on an Intel i9-9960X using 16 threads // Split the transpose into tiles of rows. Parallelize over channels - // and strips (Halide supports nested parallelism). - Var xo, yo, t; + // and strips. + Var xo, yo, t, yi; transpose.compute_root() .tile(x, y, xo, yo, x, y, vec, vec * 4) + .split(y, y, yi, vec) + .unroll(yi) .vectorize(x) - .parallel(yo) - .parallel(c); + .fuse(yo, c, t) + .parallel(t); + + blur.in(transpose) + .reorder_storage(y, x) + .compute_at(transpose, y) + .vectorize(x) + .unroll(y); // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). 
- blur.compute_at(transpose, yo); + blur.compute_at(transpose, t); // Vectorize computations within the strips. blur.update(0) diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 05b68447b6a4..5347f69b279c 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1404,10 +1404,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, return slice_vector(concat_vectors(result), 0, idx_elements); } -bool is_power_of_two(int x) { - return (x & (x - 1)) == 0; -} - // vdelta and vrdelta are instructions that take an input vector and // pass it through a network made up of levels. Each element x at each // level i can either take the element from the previous level at the diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4a5b45475533..7715fce28c34 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1363,10 +1363,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) { s.accept(this); } -bool CodeGen_LLVM::is_power_of_two(int x) const { - return (x & (x - 1)) == 0; -} - Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) { return Float(32, t.lanes()); @@ -2194,6 +2190,17 @@ void CodeGen_LLVM::visit(const Broadcast *op) { value = create_broadcast(v, op->lanes); } +Value *CodeGen_LLVM::optimization_fence(Value *v) { + llvm::Type *t = v->getType(); + internal_assert(!t->isScalableTy()) + << "optimization_fence does not support scalable vectors yet"; + const int bits = t->getPrimitiveSizeInBits(); + llvm::Type *float_type = llvm_type_of(Float(64, bits / 64)); + v = builder->CreateBitCast(v, float_type); + v = builder->CreateArithmeticFence(v, float_type); + return builder->CreateBitCast(v, t); +} + Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(!vecs.empty()); for (size_t i = 1; i < vecs.size(); i++) { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 183463d5fdb6..e006a885fc57 100644 --- 
a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -460,6 +460,10 @@ class CodeGen_LLVM : public IRVisitor { * an arbitrary number of vectors.*/ virtual llvm::Value *interleave_vectors(const std::vector &); + /** A fence to prevent fusion of ops by llvm. Designed for floats, but we + * abuse it to prevent shufflevector fusion too. */ + llvm::Value *optimization_fence(llvm::Value *); + /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments * and return type must match exactly for an overload resolution to succeed. */ @@ -523,8 +527,6 @@ class CodeGen_LLVM : public IRVisitor { /** Shorthand for shuffling a single vector. */ llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector &indices); - bool is_power_of_two(int x) const; - bool is_scalable_vector(llvm::Value *v) const; /** Go looking for a vector version of a runtime function. Will diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 3d2388fdf89c..ab854f72e897 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -11,6 +11,8 @@ #include "Substitute.h" #include "Util.h" +#include + namespace Halide { namespace Internal { @@ -111,6 +113,8 @@ class CodeGen_X86 : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; // @} + llvm::Value *interleave_vectors(const std::vector &) override; + private: Scope mem_type; }; @@ -913,6 +917,606 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init CodeGen_Posix::codegen_vector_reduce(op, init); } +Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { + // Only use x86-specific interleaving for AVX and above + if (vecs.empty() || !target.has_feature(Target::AVX)) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + if (vecs.size() == 1) { + return vecs[0]; + } + + // Get the element type and vector properties + llvm::Type *vec_type = vecs[0]->getType(); + llvm::Type 
*element_type = get_vector_element_type(vec_type); + int vec_elements = get_vector_num_elements(vec_type); + const size_t element_bits = element_type->getScalarSizeInBits(); + const size_t elems_per_native_vec = native_vector_bits() / element_bits; + const size_t elems_per_slice = 128 / element_bits; + + // Only apply special x86 logic for power-of-two interleaves for avx and + // above (TODO: Could slice into native vectors and concat results even if + // not power of two) + + if (!is_power_of_two(vec_elements) || + !is_power_of_two(vecs.size())) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + /* + x86 has a weird set of vector shuffle instructions due to historical + baggage, and the strategy in the base class for interleaving vectors + works poorly. Here we have a somewhat complex algorithm for generating + better sequences of shuffle instructions for avx and avx-512. + + Consider the location of one of the elements of one of the vectors. It + has a vector index, which says which vector it's in, and a vector lane + index, which gives the lane. x86 shuffles work in terms of 128-bit + subvectors, which we will call slices. So we'll decompose that lane index + into a slice index, to identify the 128-bit slice within a vector, and + the lane index within that slice. For avx the slice index is either zero + or one, and for avx-512 it's 0, 1, 2, or 3. Because we have limited + everything to be a power of two, we can write out these indices in + binary. We'll use v for the vector index, s for the slice index, and l + for the lane index. For an avx-512 interleave of 16 vectors of 32 + elements each (i.e. uint16s), a location could thus be written as: + + [l0 l1 l2] [s0 s1] [v0 v1 v2 v3] + + where l0 is the least-significant bit of the lane index, and so on. + + An interleave takes the bits that give the vector index and moves them to + be the least significant bits, shifting everything else over. 
So the + indices of our vectors after the interleave should be: + + [v0 v1 v2] [v3 l0] [l1 l2 s0 s1] + + Assigning numbers to each according to their final location, we start with: + + [4 5 6] [7 8] [0 1 2 3] + + and we want to issue some sequence of instructions to get us to: + + [0 1 2] [3 4] [5 6 7 8] + + Now let's consider the instructions we have available. These generally + permute these bits. E.g. an instruction that interleaves two entire + vectors, applied to every pairs of vectors, would take the some vector bit + and make it the lowest lane bit instead, shuffling the other bits upwards, + with the highest-order within-vector bit taking the place of the vector + bit (because we produce separate vectors for the low and high half of the + result. So if we used this instruction to push the highest vector bit + inwards, we could turn this: + + [4 5 6] [7 8] [0 1 2 3] + + into this: + + [3 4 5] [6 7] [0 1 2 8] + + If we did this three more times, pulling a different vector bit in each + time, we'd get: + + [0 1 2] [3 4] [5 6 7 8] + + and we'd be done! This is what the base class does. Unfortunately, x86 has + no such instruction, so we'll have to figure out something else. + Interleaving vectors often happens in contexts with high register + pressure, so we will restrict our attention to instructions that take + immediates. The most important one is vunpckl/h. This interleaves lanes + between two vectors but staying within each 128-bit slice. So the slice + bits will be unchanged, and the lane bits will be rotated right along with + one of the vector bits. So if we interleave vectors starting from the + second-highest vector bit, we can turn this: + + [4 5 6] [_ _] [_ _ 2 _] + + into this: + + [2 4 5] [_ _] [_ _ 6 _] + + where the underscores indicate bits that are unchanged. + + Unlike a full vector interleave, the slice bits stayed fixed, and the + highest within-slice lane bit (6) took the place of the vector bit + instead. This is at least a good start. 
If we do this two more times, + pulling in vector bits 0 and 1, we can make this: + + [0 1 2] [7 8] [4 5 6 3] + + The lane bits are now in the desired state. The next instruction to + consider is shufi. It's more general than this, but for our purposes there + are two interesting things we can do with it. We concatenate the low halves + of two vectors or the high halves of two vectors, which swaps the + high-order slice bit with one of the vector bits: + + [_ _ _] [_ 8] [_ _ _ 3] -> [_ _ _] [_ 3] [_ _ _ 8] + + We can also interleave the even slices of a vector with the even slices of + another (and do the same for odd), which rotates left the two slice bits + together with one of the vector bits: + + [_ _ _] [7 3] [4 _ _ _] -> [_ _ _] [3 4] [7 _ _ _] + + The vector bit became the high slice bit, the low slice bit took the place + of the vector bit, and the high slice bit becomes the low slice + bit. Filling in the underscores, we're now in this state: + + [0 1 2] [3 4] [7 5 6 8] + + Only the vector bits are wrong, but permuting entire vectors is free, + because that's just changing which register names we're referring to + (shuffling our array of llvm::Value *). So all totalled, per vector, we + needed three unckl/h instructions, and one shufi instruction of each + kind. If the vectors were a narrower type, it would have just added one + more unpckl. + + If you're interleaving lots of complete vectors, that's the whole story, + but there are other situations to consider. It's not uncommon to want to + interleave half-vectors to make some number of full vectors. We can model + this by having some slice or even lane bits start as missing. 
So + interleaving 16 half-vectors of uint16s to 8 full vectors would be + starting from this: + + [4 5 6] [7] [0 1 2 3] + + and trying to get here: + + [0 1 2] [3 4] [5 6 7] + + Each of our instructions has to operate on every vector, so to reduce the + number of instructions so we'd first like to do something to create that + missing high slice bit, halving the number of vectors. E.g. we could + identify pairs of vectors to concatenate. Let's try concatenating pairs + using the high vector bit (3): + + [4 5 6] [7 3] [0 1 2] + + Now we do three unpcks to rotate 0 1 2 into the correct place: + + [0 1 2] [7 3] [4 5 6] + + Now a single shufi can rotate 7 3 and 4: + + [0 1 2] [3 4] [7 5 6] + + and we just need to reorder whole vectors and we're done. So in this case + we needed only a single shufi instruction, because our desired low slice + bit (3) was already sitting there as the high slice bit after + pairwise concatenation. + + Now consider the case where we had only four half-vectors to interleave to + produce two whole vectors: + + [2 3 4] [5] [0 1] + + There's no good concatenation we can do to make whole vectors. That 0 and 1 + both need to end up as lanes bits, and we have no instructions that swap + slice bits with lanes bits. So we'll just have to run unpck instructions at + half-vector width to push that 4 into the vector bit range: + + [1 2 3] [5] [0 4] + + and now we can concatenate according to bit 4 to make whole vectors + + [1 2 3] [5 4] [0] + + We then do one more unpck to pull the 0 down: + + [0 1 2] [5 4] [3] + + Next, we need to make 3 a slice bit. We can use shufi to swap it with 4: + + [0 1 2] [5 3] [4] + + and then another shufi to rotate those three + + [0 1 2] [3 4] [5] + + and we're done. 
+ + Depending on how many of each bit we start with, we can also end up in + situations where everything is correct except the two slice bits are in + the wrong order, in which case we can use a shufi instruction with a + vector and itself to swap those two bits. + + So there are many possible paths depending on the number of elements per + vector, the number of elements per 128-bit slice of each vector, and the + number of vectors to interleave. The way to stay sane is to just + explicitly track the vectors above as l_bits, s_bits, and v_bits, and + transform it alongside all our instructions as we try to get the right + bits in the right final places. + */ + + // Make a working copy + std::vector v = vecs; + + // The number of 128-bit slices per vector is 2 for avx and 4 for avx512 + const int final_num_s_bits = ctz64(native_vector_bits() / 128); + internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) << native_vector_bits() << " " << final_num_s_bits << "\n"; + + const int num_v_bits = ctz64(v.size()); + const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 0 : ctz64(vec_elements / elems_per_slice); + const int num_l_bits = ctz64(std::min((size_t)vec_elements, elems_per_slice)); + + // Construct the initial tracking vectors for each bit location + std::vector v_bits(num_v_bits), l_bits(num_l_bits), s_bits(num_s_bits); + int c = 0; + for (int i = 0; i < num_v_bits; i++) { + // We want the v bits to end up innermost, so number them 0, 1, 2 ... 
+ v_bits[i] = c++; + } + for (int i = 0; i < num_l_bits; i++) { + // Then come the l bits + l_bits[i] = c++; + } + for (int i = 0; i < num_s_bits; i++) { + // and finally, the slice bits + s_bits[i] = c++; + } + + // Now we define helpers for each instruction we are going to use + + // unpckl/h instruction + auto unpck = [&](Value *a, Value *b) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + + for (int i = 0; i < n; i += (int)elems_per_slice) { + int half = (int)elems_per_slice / 2; + // For the low result, interleave the first half of each slice + for (int j = 0; j < half; j++) { + lo_indices.push_back(i + j); + lo_indices.push_back(n + i + j); + } + // For the high result, interleave the second half of each slice + for (int j = half; j < (int)elems_per_slice; j++) { + hi_indices.push_back(i + j); + hi_indices.push_back(n + i + j); + } + } + + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + // Everything falls apart if we let LLVM fuse shuffles, so we add + // optimization fences around the results to ensure we get the + // instructions we're asking for. + return {optimization_fence(lo), optimization_fence(hi)}; + }; + + // shufi instruction, with or without cross-over + auto shufi = [&](Value *a, Value *b, bool crossover) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + if (final_num_s_bits == 2) { + // AVX-512 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + (crossover ? 1 : 2) * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i + (crossover ? 
2 : 1) * (int)elems_per_slice); + hi_indices.push_back(i + 3 * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice * 2; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } else { + // AVX-2 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + return {optimization_fence(lo), optimization_fence(hi)}; + }; + + // A 2x2 transpose of slices within a single vector + auto self_shufi = [&](Value *a) -> Value * { + internal_assert(4 * (int)elems_per_slice == vec_elements) + << "Should only be using shufi helper when targeting avx-512 shuffles on native vectors\n" + << elems_per_slice << " " << vec_elements << " " << native_vector_bits() << "\n"; + std::vector indices; + for (int j : {0, 2, 1, 3}) { + for (int i = 0; i < (int)elems_per_slice; i++) { + indices.push_back(i + j * (int)elems_per_slice); + } + } + return optimization_fence(shuffle_vectors(a, a, indices)); + }; + + // First, if the vectors are wider than native, that will manifest as too + // many slice bits. Cut them into separate native vectors. This will not + // create any instructions. + while ((size_t)vec_elements > elems_per_native_vec) { + int cut = vec_elements / 2; + std::vector new_v; + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, 0, cut)); + } + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, cut, cut)); + } + v = new_v; + vec_elements = cut; + + v_bits.push_back(s_bits.back()); + s_bits.pop_back(); + } + + // Interleave pairs if we have vectors smaller than a single slice. 
Choosing + // which pairs to interleave is important because we want to pull down v + // bits that are destined to end up as l bits, and we want to pull them down + // in order. + if ((size_t)vec_elements < elems_per_slice) { + int highest_desired_l_bit = ctz64(elems_per_slice) - 1; + int bit = highest_desired_l_bit; + if (!v_bits.empty() && std::find(v_bits.begin(), v_bits.end(), bit) == v_bits.end()) { + bit = v_bits.back(); + } + + while (bit >= 0 && (size_t)vec_elements < elems_per_slice && !v_bits.empty()) { + auto it = std::find(v_bits.begin(), v_bits.end(), bit); + if (it == v_bits.end()) { + break; + } + int j = it - v_bits.begin(); + v_bits.erase(it); + l_bits.insert(l_bits.begin(), bit); + + // The distance in the vecs array is the index of the corresponding + // v bit we're pulling down. + int step = 1 << j; + std::vector new_v; + new_v.reserve(v.size() / 2); + for (size_t i = 0; i < v.size(); i++) { + // Pair each vector with the one separated by the step. + size_t j = i ^ step; + + // Don't process vectors twice. + if (j < i) continue; + + // Just interleave the two vectors. Because we have fewer + // elements than one slice, unpckl/h is a straight interleave. + std::vector indices; + for (int k = 0; k < vec_elements; k++) { + indices.push_back(k); + indices.push_back(vec_elements + k); + } + new_v.push_back(shuffle_vectors(v[i], v[j], indices)); + } + v.swap(new_v); + vec_elements *= 2; + bit--; + } + } + + // Concatenate/repack to get at least the desired number of slice bits. + while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) { + int desired_low_slice_bit = ctz64(elems_per_slice); + int desired_high_slice_bit = desired_low_slice_bit + 1; + + int bit; + if (!s_bits.empty() && + s_bits[0] == desired_low_slice_bit) { + // Only the avx-512 path should land here due to the while condition. 
+ internal_assert(final_num_s_bits == 2); + bit = desired_high_slice_bit; + } else { + bit = desired_low_slice_bit; + } + + auto v_it = std::find(v_bits.begin(), v_bits.end(), bit); + if (v_it != v_bits.end()) { + int j = v_it - v_bits.begin(); + v_bits.erase(v_it); + s_bits.push_back(bit); + + int step = 1 << j; + std::vector new_v; + new_v.reserve(v.size() / 2); + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + new_v.push_back(concat_vectors({v[i], v[k]})); + } + v.swap(new_v); + vec_elements *= 2; + } else { + // Oh no, the bit we wanted to use isn't in v_bits, it's in l_bits. + // We'll do sub-width unpck instead with an appropriate v bit to try + // to push it out. This is in a while loop, so it will keep doing + // this until it pops out the top of the l bits and we identify it + // as a v bit. + if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { + int b = l_bits[0] - 1; + if (std::find(v_bits.begin(), v_bits.end(), b) == v_bits.end()) { + b = v_bits.back(); + } + + auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); + int j = vb_it - v_bits.begin(); + *vb_it = l_bits.back(); + l_bits.pop_back(); + l_bits.insert(l_bits.begin(), b); + + int step = 1 << j; + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + auto [lo, hi] = unpck(v[i], v[k]); + v[i] = lo; + v[k] = hi; + } + } + } + } + + // If only one vector is left, we just need to check if the slice bits are + // in the right order: + if (v_bits.empty()) { + internal_assert(v.size() == 1); + if (s_bits.size() == 2 && s_bits[0] > s_bits[1]) { + v[0] = self_shufi(v[0]); + std::swap(s_bits[0], s_bits[1]); + } + return v[0]; + } + + // Now we have at least two whole vectors. Next we finalize lane bits using + // unpck instructions. 
+ while (l_bits[0] != 0) { + int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1; + + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + internal_assert(vb_it != v_bits.end()); + + int j = vb_it - v_bits.begin(); + *vb_it = l_bits.back(); + l_bits.pop_back(); + l_bits.insert(l_bits.begin(), bit); + + int step = 1 << j; + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) continue; + auto [lo, hi] = unpck(v[i], v[k]); + v[i] = lo; + v[k] = hi; + } + } + + // They should be 0, 1, 2, 3... + for (int i = 0; i < (int)l_bits.size(); i++) { + internal_assert(l_bits[i] == i); + } + + // Then we fix the slice bits with shufi instructions + + // First the low slice bit + int low_slice_bit = l_bits.size(); + auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); + if (ls_in_v != v_bits.end()) { + int i = ls_in_v - v_bits.begin(); + int step = 1 << i; + std::swap(*ls_in_v, s_bits.back()); + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], false); + v[idx] = lo; + v[j] = hi; + } + } + + // And then the high slice bit, if there is one + if (final_num_s_bits == 2) { + // AVX-512 + int high_slice_bit = low_slice_bit + 1; + auto hs_in_v = std::find(v_bits.begin(), v_bits.end(), high_slice_bit); + if (hs_in_v != v_bits.end()) { + // The high slice bit is in the v_bits. Note that if it's not, it'll + // be one of the slice bits. It can't be an l bit, because we've + // already finalized them. + int i = hs_in_v - v_bits.begin(); + int step = 1 << i; + + if (!s_bits.empty() && s_bits.back() == low_slice_bit) { + // The low slice bit is currently occupying the high slice bit slot, + // so we need to shuffle it over at the same time by using the + // crossover variant of shufi. 
+ int temp = s_bits[0]; + s_bits[0] = s_bits.back(); + s_bits.back() = *hs_in_v; + *hs_in_v = temp; + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], true); + v[idx] = lo; + v[j] = hi; + } + } else { + // The low slice bit must be already in place, so no crossover required. + internal_assert(s_bits[0] == low_slice_bit); + std::swap(*hs_in_v, s_bits.back()); + + for (size_t idx = 0; idx < v.size(); idx++) { + size_t j = idx ^ step; + if (j <= idx) continue; + auto [lo, hi] = shufi(v[idx], v[j], false); + v[idx] = lo; + v[j] = hi; + } + } + } else if (s_bits.size() == 2 && + s_bits[0] == high_slice_bit && + s_bits[1] == low_slice_bit) { + // The slice bits are both there, but in the wrong order + std::swap(s_bits[0], s_bits[1]); + for (size_t i = 0; i < v.size(); i++) { + v[i] = self_shufi(v[i]); + } + } + + // Both slice bits should be correct now + internal_assert(s_bits.size() == 2 && + s_bits[0] == low_slice_bit && + s_bits[1] == high_slice_bit); + + } else { + // AVX-2 The sole slice bit should be correct now. + internal_assert(s_bits.size() == 1 && + s_bits[0] == low_slice_bit); + } + + // The lane and slice bits are correct, but the vectors are in some + // arbitrary order. We'll reorder them by deinterleaving the list according + // to each bit position, in increasing order. 
+ for (size_t i = 0; i < v_bits.size(); i++) { + int bit = i + s_bits.size() + l_bits.size(); + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + internal_assert(vb_it != v_bits.end()); + + int j = vb_it - v_bits.begin(); + v_bits.erase(vb_it); + v_bits.push_back(bit); + + std::vector a, b; + a.reserve(v.size() / 2); + b.reserve(v.size() / 2); + int mask = 1 << j; + for (size_t k = 0; k < v.size(); k++) { + if ((k & mask) == 0) { + a.push_back(v[k]); + } else { + b.push_back(v[k]); + } + } + v.clear(); + v.insert(v.end(), a.begin(), a.end()); + v.insert(v.end(), b.begin(), b.end()); + } + + // The v bits should be correct now + for (int i = 0; i < (int)v_bits.size(); i++) { + internal_assert(v_bits[i] == i + (int)(l_bits.size() + s_bits.size())); + } + + // Concatenate all results into a single vector. Phew. + return concat_vectors(v); +} + void CodeGen_X86::visit(const Allocate *op) { ScopedBinding bind(mem_type, op->name, op->memory_type); CodeGen_Posix::visit(op); diff --git a/src/Util.h b/src/Util.h index 3196c1966cbb..4a6c84d9e594 100644 --- a/src/Util.h +++ b/src/Util.h @@ -568,6 +568,11 @@ inline int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } +/** Return whether or not an integer is a power of two. 
*/ +inline bool is_power_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + template inline T align_up(T x, int n) { return (x + n - 1) / n * n; From 188bee0d01f154fd08d594bbe94403e9a8a03e1e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:00:39 -0800 Subject: [PATCH 02/34] Update test to be more exhaustive --- test/performance/block_transpose.cpp | 188 +++++++++++++-------------- 1 file changed, 89 insertions(+), 99 deletions(-) diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 740908358443..9915cf8e5f51 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -7,108 +7,73 @@ using namespace Halide; using namespace Halide::Tools; -enum { - scalar_trans, - vec_y_trans, - vec_x_trans +struct Result { + int type_size, block_width, block_height; + double bandwidth; }; -Buffer test_transpose(int mode) { - Func input, block, block_transpose, output; - Var x, y; - - input(x, y) = cast(x + y); - input.compute_root(); +template +Result test_transpose(int block_width, int block_height, const Target &t) { + const int N = 256; + Buffer in(N, N), out(N, N); - block(x, y) = input(x, y); - block_transpose(x, y) = block(y, x); - output(x, y) = block_transpose(x, y); - - Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. 
- block.compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block_transpose.compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block_transpose.compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block_transpose.compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + in(x, y) = (T)(x + y * N); + } } - Buffer result(1024, 1024); - output.compile_jit(); - - output.realize(result); - - double t = benchmark([&]() { - output.realize(result); - }); - - std::cout << "Dummy Func version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n"; - return result; -} - -/* This illustrates how to achieve the same scheduling behavior using the 'in()' - * directive as opposed to creating dummy Funcs as done in 'test_transpose()' */ -Buffer test_transpose_wrap(int mode) { Func input, block_transpose, block, output; Var x, y; - input(x, y) = cast(x + y); - input.compute_root(); + input(x, y) = in(x, y); output(x, y) = input(y, x); Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. 
- block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; - } + output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + // Do vectorized loads from the input. + input.in().compute_at(output, x).vectorize(x).unroll(y); + + // Transpose in registers + input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + + // TODO: Should not be necessary, but prevents licm from doing something dumb. 
+ output.output_buffer().dim(0).set_bounds(0, 256); - Buffer result(1024, 1024); output.compile_jit(); - output.realize(result); + output.realize(out); - double t = benchmark([&]() { - output.realize(result); + double time = benchmark(10, 10, [&]() { + output.realize(out); }); - std::cout << "Wrapper version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n"; - return result; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + T actual = out(x, y), correct = in(y, x); + if (actual != correct) { + std::cerr << "out(" << x << ", " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + /* + output.compile_to_assembly(Internal::get_test_tmp_dir() + "transpose_uint" + + std::to_string(sizeof(T) * 8) + "_" + + std::to_string(block_width) + "x" + + std::to_string(block_height) + ".s", + std::vector{in}, "transpose", t); + */ + + return Result{(int)sizeof(T), block_width, block_height, + out.size_in_bytes() / (1.0e9 * time)}; } int main(int argc, char **argv) { @@ -118,23 +83,48 @@ int main(int argc, char **argv) { return 0; } - test_transpose(scalar_trans); - test_transpose_wrap(scalar_trans); - test_transpose(vec_y_trans); - test_transpose_wrap(vec_y_trans); - - Buffer im1 = test_transpose(vec_x_trans); - Buffer im2 = test_transpose_wrap(vec_x_trans); - - // Check correctness of the wrapper version - for (int y = 0; y < im2.height(); y++) { - for (int x = 0; x < im2.width(); x++) { - if (im2(x, y) != im1(x, y)) { - printf("wrapper(%d, %d) = %d instead of %d\n", - x, y, im2(x, y), im1(x, y)); - return 1; + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "Computing best tile sizes for each type\n"; + std::vector results; + int limit = 64 * 64; + for (int bh : {1, 2, 4, 8, 16, 32, 64}) { + for (int bw : {1, 2, 4, 8, 16, 32, 64}) { + std::cout << "." 
<< std::flush; + results.push_back(test_transpose(bw, bh, target)); + if (bw * bh <= limit / 2) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 4) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 8) { + results.push_back(test_transpose(bw, bh, target)); + } + } + } + std::cout << "\nbytes, tile width, tile height, bandwidth (GB/s):\n"; + + // Sort the results by bandwidth + std::sort(results.begin(), results.end(), + [](const Result &a, const Result &b) { + return a.bandwidth > b.bandwidth; + }); + + // Print top n tile sizes for each type + for (int t : {1, 2, 4, 8}) { + int top_n = 5; + for (size_t i = 0; i < results.size() && top_n > 0; i++) { + if (results[i].type_size == t) { + std::cout << t << " " + << results[i].block_width << " " + << results[i].block_height << " " + << results[i].bandwidth << "\n"; + top_n--; } } + std::cout << "\n"; } printf("Success!\n"); From 2ba8ddeac15016095f2b41fec6936f0ba80eb820 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:38:15 -0800 Subject: [PATCH 03/34] Fix comment. The previous comment reported a time that seemed to have regressed. It was not 8.2ms on main - more like 11 --- apps/iir_blur/iir_blur_generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 4e4db6e61410..3c4dee4304af 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -36,7 +36,7 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule if (!skip_schedule) { if (!target.has_gpu_feature()) { // CPU schedule. - // 8.2ms on an Intel i9-9960X using 16 threads + // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads // Split the transpose into tiles of rows. Parallelize over channels // and strips. 
Var xo, yo, t, yi; From d102f7bee65116d980284ecdc60fd2b8997e9db3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 11:42:50 -0800 Subject: [PATCH 04/34] Comment fix --- src/CodeGen_X86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ab854f72e897..6ef95f6b51bc 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -982,7 +982,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. an instruction that interleaves two entire - vectors, applied to every pairs of vectors, would take the some vector bit + vectors, applied to pairs of vectors, would take the some vector bit and make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the From 46d41ddbe8bfe1bccf33c211902072407f0ead39 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 12:54:39 -0800 Subject: [PATCH 05/34] clang-tidy fixes --- src/CodeGen_X86.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 6ef95f6b51bc..968e9f25e54a 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -982,8 +982,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. 
an instruction that interleaves two entire - vectors, applied to pairs of vectors, would take the some vector bit - and make it the lowest lane bit instead, shuffling the other bits upwards, + vectors, applied to pairs of vectors, would take the some vector bit and + make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the result. So if we used this instruction to push the highest vector bit @@ -1239,6 +1239,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { while ((size_t)vec_elements > elems_per_native_vec) { int cut = vec_elements / 2; std::vector new_v; + new_v.reserve(v.size() * 2); for (auto *vec : v) { new_v.push_back(slice_vector(vec, 0, cut)); } @@ -1282,7 +1283,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { size_t j = i ^ step; // Don't process vectors twice. - if (j < i) continue; + if (j < i) { + continue; + } // Just interleave the two vectors. Because we have fewer // elements than one slice, unpckl/h is a straight interleave. 
@@ -1411,7 +1414,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], false); v[idx] = lo; v[j] = hi; @@ -1441,7 +1446,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], true); v[idx] = lo; v[j] = hi; @@ -1453,7 +1460,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { for (size_t idx = 0; idx < v.size(); idx++) { size_t j = idx ^ step; - if (j <= idx) continue; + if (j <= idx) { + continue; + } auto [lo, hi] = shufi(v[idx], v[j], false); v[idx] = lo; v[j] = hi; @@ -1464,8 +1473,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits[1] == low_slice_bit) { // The slice bits are both there, but in the wrong order std::swap(s_bits[0], s_bits[1]); - for (size_t i = 0; i < v.size(); i++) { - v[i] = self_shufi(v[i]); + for (auto &vec : v) { + vec = self_shufi(vec); } } From 27f122026c317b18afc3b39b0044bd135d8e135a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 14:16:52 -0800 Subject: [PATCH 06/34] Make variable names more consistent --- src/CodeGen_X86.cpp | 61 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 968e9f25e54a..2b086538eee5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1328,7 +1328,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { new_v.reserve(v.size() / 2); for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } new_v.push_back(concat_vectors({v[i], v[k]})); } v.swap(new_v); @@ -1341,11 +1343,12 @@ Value 
*CodeGen_X86::interleave_vectors(const std::vector &vecs) { // as a v bit. if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { int b = l_bits[0] - 1; - if (std::find(v_bits.begin(), v_bits.end(), b) == v_bits.end()) { - b = v_bits.back(); + auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); + if (vb_it == v_bits.end()) { + vb_it = v_bits.end() - 1; + b = *vb_it; } - auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); int j = vb_it - v_bits.begin(); *vb_it = l_bits.back(); l_bits.pop_back(); @@ -1354,7 +1357,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int step = 1 << j; for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } auto [lo, hi] = unpck(v[i], v[k]); v[i] = lo; v[k] = hi; @@ -1390,7 +1395,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int step = 1 << j; for (size_t i = 0; i < v.size(); i++) { size_t k = i ^ step; - if (k < i) continue; + if (k < i) { + continue; + } auto [lo, hi] = unpck(v[i], v[k]); v[i] = lo; v[k] = hi; @@ -1408,18 +1415,18 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { int low_slice_bit = l_bits.size(); auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); if (ls_in_v != v_bits.end()) { - int i = ls_in_v - v_bits.begin(); - int step = 1 << i; + int j = ls_in_v - v_bits.begin(); + int step = 1 << j; std::swap(*ls_in_v, s_bits.back()); - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], false); - v[idx] = lo; - v[j] = hi; + auto [lo, hi] = shufi(v[i], v[k], false); + v[i] = lo; + v[k] = hi; } } @@ -1432,8 +1439,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The high slice bit is in the v_bits. Note that if it's not, it'll // be one of the slice bits. 
It can't be an l bit, because we've // already finalized them. - int i = hs_in_v - v_bits.begin(); - int step = 1 << i; + int j = hs_in_v - v_bits.begin(); + int step = 1 << j; if (!s_bits.empty() && s_bits.back() == low_slice_bit) { // The low slice bit is currently occupying the high slice bit slot, @@ -1444,27 +1451,27 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.back() = *hs_in_v; *hs_in_v = temp; - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], true); - v[idx] = lo; - v[j] = hi; + auto [lo, hi] = shufi(v[i], v[k], true); + v[i] = lo; + v[k] = hi; } } else { // The low slice bit must be already in place, so no crossover required. internal_assert(s_bits[0] == low_slice_bit); std::swap(*hs_in_v, s_bits.back()); - for (size_t idx = 0; idx < v.size(); idx++) { - size_t j = idx ^ step; - if (j <= idx) { + for (size_t i = 0; i < v.size(); i++) { + size_t k = i ^ step; + if (k < i) { continue; } - auto [lo, hi] = shufi(v[idx], v[j], false); - v[idx] = lo; + auto [lo, hi] = shufi(v[i], v[k], false); + v[i] = lo; v[j] = hi; } } From 5576f46776bbfc0f629c6ec8adf8986943056fd5 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 15:18:19 -0800 Subject: [PATCH 07/34] Simplify code with helper lambda --- src/CodeGen_X86.cpp | 118 ++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 69 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 2b086538eee5..8cfcedccd50d 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1233,6 +1233,23 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { return optimization_fence(shuffle_vectors(a, a, indices)); }; + // A helper to iterate over all pairs of entries in v, separated by some + // power-of-two spacing. 
+ auto for_all_pairs = [&](size_t log_step, auto fn) { + size_t step = 1 << log_step; + for (size_t i = 0; i < v.size(); i++) { + // Pair each vector with the one separated by the step. + size_t j = i ^ step; + + // Don't process vectors twice. + if (j < i) { + continue; + } + + fn(&v[i], &v[j]); + } + }; + // First, if the vectors are wider than native, that will manifest as too // many slice bits. Cut them into separate native vectors. This will not // create any instructions. @@ -1275,18 +1292,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The distance in the vecs array is the index of the corresponding // v bit we're pulling down. - int step = 1 << j; std::vector new_v; new_v.reserve(v.size() / 2); - for (size_t i = 0; i < v.size(); i++) { - // Pair each vector with the one separated by the step. - size_t j = i ^ step; - - // Don't process vectors twice. - if (j < i) { - continue; - } - + for_all_pairs(j, [&](auto *a, auto *b) { // Just interleave the two vectors. Because we have fewer // elements than one slice, unpckl/h is a straight interleave. 
std::vector indices; @@ -1294,8 +1302,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { indices.push_back(k); indices.push_back(vec_elements + k); } - new_v.push_back(shuffle_vectors(v[i], v[j], indices)); - } + new_v.push_back(shuffle_vectors(*a, *b, indices)); + }); v.swap(new_v); vec_elements *= 2; bit--; @@ -1323,16 +1331,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { v_bits.erase(v_it); s_bits.push_back(bit); - int step = 1 << j; std::vector new_v; new_v.reserve(v.size() / 2); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - new_v.push_back(concat_vectors({v[i], v[k]})); - } + for_all_pairs(j, [&](auto *a, auto *b) { + new_v.push_back(concat_vectors({*a, *b})); + }); v.swap(new_v); vec_elements *= 2; } else { @@ -1354,16 +1357,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { l_bits.pop_back(); l_bits.insert(l_bits.begin(), b); - int step = 1 << j; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = unpck(v[i], v[k]); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = unpck(*a, *b); + *a = lo; + *b = hi; + }); } } } @@ -1392,16 +1390,11 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { l_bits.pop_back(); l_bits.insert(l_bits.begin(), bit); - int step = 1 << j; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = unpck(v[i], v[k]); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = unpck(*a, *b); + *a = lo; + *b = hi; + }); } // They should be 0, 1, 2, 3... 
@@ -1416,18 +1409,13 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); if (ls_in_v != v_bits.end()) { int j = ls_in_v - v_bits.begin(); - int step = 1 << j; std::swap(*ls_in_v, s_bits.back()); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], false); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); } // And then the high slice bit, if there is one @@ -1440,7 +1428,6 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // be one of the slice bits. It can't be an l bit, because we've // already finalized them. int j = hs_in_v - v_bits.begin(); - int step = 1 << j; if (!s_bits.empty() && s_bits.back() == low_slice_bit) { // The low slice bit is currently occupying the high slice bit slot, @@ -1451,29 +1438,22 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.back() = *hs_in_v; *hs_in_v = temp; - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], true); - v[i] = lo; - v[k] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, true); + *a = lo; + *b = hi; + }); + } else { // The low slice bit must be already in place, so no crossover required. 
internal_assert(s_bits[0] == low_slice_bit); std::swap(*hs_in_v, s_bits.back()); - for (size_t i = 0; i < v.size(); i++) { - size_t k = i ^ step; - if (k < i) { - continue; - } - auto [lo, hi] = shufi(v[i], v[k], false); - v[i] = lo; - v[j] = hi; - } + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); } } else if (s_bits.size() == 2 && s_bits[0] == high_slice_bit && From 107aaa5122b6c962c51b2cefb181671315368635 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 27 Jan 2026 15:20:06 -0800 Subject: [PATCH 08/34] Comment tweaks --- src/CodeGen_X86.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8cfcedccd50d..b68987bebd71 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -950,17 +950,17 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { works poorly. Here we have a somewhat complex algorithm for generating better sequences of shuffle instructions for avx and avx-512. - Consider the location of one of the elements of one of the vectors. It - has a vector index, which says which vector it's in, and a vector lane - index, which gives the lane. x86 shuffles work in terms of 128-bit - subvectors, which we will call slices. So we'll decompose that lane index - into a slice index, to identify the 128-bit slice within a vector, and - the lane index within that slice. For avx the slice index is either zero - or one, and for avx-512 it's 0, 1, 2, or 3. Because we have limited - everything to be a power of two, we can write out these indices in - binary. We'll use v for the vector index, s for the slice index, and l - for the lane index. For an avx-512 interleave of 16 vectors of 32 - elements each (i.e. uint16s), a location could thus be written as: + Consider the location of one of the elements of one of the vectors. 
It has + a vector index, which says which vector it's in, and a vector lane index, + which gives the lane. x86 shuffles work in terms of 128-bit subvectors, + which we will call slices. So we'll decompose that lane index into a slice + index, to identify the 128-bit slice within a vector, and the lane index + within that slice. For avx the slice index is either zero or one, and for + avx-512 it can be zero through three. Because we have limited everything + to be a power of two, we can write out these indices in binary. We'll use + v for the vector index, s for the slice index, and l for the lane + index. For an avx-512 interleave of 16 vectors of 32 elements each + (i.e. uint16s), a location could thus be written as: [l0 l1 l2] [s0 s1] [v0 v1 v2 v3] @@ -982,7 +982,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { Now let's consider the instructions we have available. These generally permute these bits. E.g. an instruction that interleaves two entire - vectors, applied to pairs of vectors, would take the some vector bit and + vectors, applied to pairs of vectors, would take the same vector bit and make it the lowest lane bit instead, shuffling the other bits upwards, with the highest-order within-vector bit taking the place of the vector bit (because we produce separate vectors for the low and high half of the From 0bc1b9f28c6f0c1d3a0b48b43b578014ee116f83 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 28 Jan 2026 13:36:52 -0800 Subject: [PATCH 09/34] Don't do half-width unpcks --- src/CodeGen_X86.cpp | 150 +++++++++++++-------------- test/performance/block_transpose.cpp | 3 +- 2 files changed, 75 insertions(+), 78 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index b68987bebd71..4bf504967ab3 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -936,11 +936,13 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { const size_t elems_per_slice = 128 / element_bits; // Only apply special x86 
logic for power-of-two interleaves for avx and - // above (TODO: Could slice into native vectors and concat results even if - // not power of two) + // above where we're going to end up with multiple native vectors (TODO: + // Could slice into native vectors and concat results even if not power of + // two) if (!is_power_of_two(vec_elements) || - !is_power_of_two(vecs.size())) { + !is_power_of_two(vecs.size()) || + (vecs.size() * vec_elements * element_bits) <= (size_t)native_vector_bits()) { return CodeGen_Posix::interleave_vectors(vecs); } @@ -1091,35 +1093,27 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { [2 3 4] [5] [0 1] - There's no good concatenation we can do to make whole vectors. That 0 and 1 - both need to end up as lanes bits, and we have no instructions that swap - slice bits with lanes bits. So we'll just have to run unpck instructions at - half-vector width to push that 4 into the vector bit range: + Let's concatenate adjacent pairs as before. - [1 2 3] [5] [0 4] + [2 3 4] [5 0] [1] - and now we can concatenate according to bit 4 to make whole vectors + Now we do one unpck - [1 2 3] [5 4] [0] - - We then do one more unpck to pull the 0 down: - - [0 1 2] [5 4] [3] - - Next, we need to make 3 a slice bit. We can use shufi to swap it with 4: + [1 2 3] [5 0] [4] - [0 1 2] [5 3] [4] + And we encounter a problem when it comes to the second one. The next bit + we want pull in is hiding in the slice bits, which unpck instructions + can't access. So at this point we use a shufi to push it back into the + vector bits, swapping 0 and 4. - and then another shufi to rotate those three + [1 2 3] [5 4] [0] - [0 1 2] [3 4] [5] + Now we can do the last unpck. - and we're done. 
+ [0 1 2] [5 4] [3] - Depending on how many of each bit we start with, we can also end up in - situations where everything is correct except the two slice bits are in - the wrong order, in which case we can use a shufi instruction with a - vector and itself to swap those two bits. + From here we can use two shufi instructions to fix up the vector and slice + bits. So there are many possible paths depending on the number of elements per vector, the number of elements per 128-bit slice of each vector, and the @@ -1134,7 +1128,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // The number of 128-bit slices per vector is 2 for avx and 4 for avx512 const int final_num_s_bits = ctz64(native_vector_bits() / 128); - internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) << native_vector_bits() << " " << final_num_s_bits << "\n"; + internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) + << native_vector_bits() << " " << final_num_s_bits; const int num_v_bits = ctz64(v.size()); const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 
0 : ctz64(vec_elements / elems_per_slice); @@ -1216,7 +1211,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } Value *lo = shuffle_vectors(a, b, lo_indices); Value *hi = shuffle_vectors(a, b, hi_indices); - return {optimization_fence(lo), optimization_fence(hi)}; + return {lo, hi}; }; // A 2x2 transpose of slices within a single vector @@ -1230,7 +1225,7 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { indices.push_back(i + j * (int)elems_per_slice); } } - return optimization_fence(shuffle_vectors(a, a, indices)); + return shuffle_vectors(a, a, indices); }; // A helper to iterate over all pairs of entries in v, separated by some @@ -1326,64 +1321,64 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } auto v_it = std::find(v_bits.begin(), v_bits.end(), bit); - if (v_it != v_bits.end()) { - int j = v_it - v_bits.begin(); - v_bits.erase(v_it); - s_bits.push_back(bit); - std::vector new_v; - new_v.reserve(v.size() / 2); - for_all_pairs(j, [&](auto *a, auto *b) { - new_v.push_back(concat_vectors({*a, *b})); - }); - v.swap(new_v); - vec_elements *= 2; - } else { - // Oh no, the bit we wanted to use isn't in v_bits, it's in l_bits. - // We'll do sub-width unpck instead with an appropriate v bit to try - // to push it out. This is in a while loop, so it will keep doing - // this until it pops out the top of the l bits and we identify it - // as a v bit. - if (std::find(l_bits.begin(), l_bits.end(), bit) != l_bits.end()) { - int b = l_bits[0] - 1; - auto vb_it = std::find(v_bits.begin(), v_bits.end(), b); - if (vb_it == v_bits.end()) { - vb_it = v_bits.end() - 1; - b = *vb_it; - } + if (v_it == v_bits.end()) { + // Just concatenate according to the lowest vector bit. 
+ v_it = v_bits.begin(); + bit = *v_it; + } - int j = vb_it - v_bits.begin(); - *vb_it = l_bits.back(); - l_bits.pop_back(); - l_bits.insert(l_bits.begin(), b); + int j = v_it - v_bits.begin(); + v_bits.erase(v_it); + s_bits.push_back(bit); - for_all_pairs(j, [&](auto *a, auto *b) { - auto [lo, hi] = unpck(*a, *b); - *a = lo; - *b = hi; - }); - } - } + std::vector new_v; + new_v.reserve(v.size() / 2); + for_all_pairs(j, [&](auto *a, auto *b) { + new_v.push_back(concat_vectors({*a, *b})); + }); + v.swap(new_v); + vec_elements *= 2; } - // If only one vector is left, we just need to check if the slice bits are - // in the right order: - if (v_bits.empty()) { - internal_assert(v.size() == 1); - if (s_bits.size() == 2 && s_bits[0] > s_bits[1]) { - v[0] = self_shufi(v[0]); - std::swap(s_bits[0], s_bits[1]); - } - return v[0]; - } + // There should be more than one vector left + internal_assert(v.size() > 1); - // Now we have at least two whole vectors. Next we finalize lane bits using + // Now we have at least two whole vectors. Next we try to finalize lane bits using // unpck instructions. while (l_bits[0] != 0) { int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1; auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); - internal_assert(vb_it != v_bits.end()); + + // internal_assert(vb_it != v_bits.end()); + if (vb_it == v_bits.end()) { + // The next bit is not in vector bits. It must be hiding in the + // slice bits due to earlier concatenation. Move it into the v_bits + // with a shufi + if (s_bits.back() == bit) { + // It's the last (or sole) slice bit. Swap it with the first v bit + std::swap(s_bits.back(), v_bits[0]); + for_all_pairs(0, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); + } else { + internal_assert(s_bits.size() == 2 && s_bits[0] == bit); + // It's the low slice bit. We need shufi with crossover. 
+ int v_bit = v_bits[0]; + v_bits[0] = s_bits[0]; + s_bits[0] = s_bits[1]; + s_bits[1] = v_bit; + for_all_pairs(0, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, true); + *a = lo; + *b = hi; + }); + } + vb_it = v_bits.begin(); + } int j = vb_it - v_bits.begin(); *vb_it = l_bits.back(); @@ -1397,14 +1392,15 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { }); } - // They should be 0, 1, 2, 3... + // Lane bits should now be 0, 1, 2, 3... for (int i = 0; i < (int)l_bits.size(); i++) { internal_assert(l_bits[i] == i); } - // Then we fix the slice bits with shufi instructions + // Time to fix the slice bits - // First the low slice bit + // First the low slice bit. If it's one of the v bits, move it to be the + // high slice bit with a shufi. int low_slice_bit = l_bits.size(); auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); if (ls_in_v != v_bits.end()) { diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 9915cf8e5f51..8760d8ac5495 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -56,7 +56,8 @@ Result test_transpose(int block_width, int block_height, const Target &t) { for (int x = 0; x < N; x++) { T actual = out(x, y), correct = in(y, x); if (actual != correct) { - std::cerr << "out(" << x << ", " << y << ") = " + std::cerr << "For block size (" << block_width << ", " << block_height << "): " + << "out(" << x << ", " << y << ") = " << actual << " instead of " << correct << "\n"; exit(1); } From cdc1de283010d19d58bc5944ae96f1e5ca4171f6 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 30 Jan 2026 15:50:35 +1100 Subject: [PATCH 10/34] Use optimization fences in the base class too Before: Computing best tile sizes for each type ................................................. 
bytes, tile width, tile height, bandwidth (GB/s): 1 8 8 20.9997 1 16 8 20.8329 1 8 16 18.5702 1 8 32 17.2463 1 8 64 14.312 2 8 16 19.2047 2 8 8 18.8368 2 16 8 17.0593 2 8 32 17.0591 2 4 8 15.7681 4 8 8 24.9364 4 4 16 22.9699 4 8 16 22.5743 4 4 32 22.255 4 4 8 20.4468 8 8 8 38.4094 8 16 4 28.4167 8 16 8 27.6184 8 8 4 27.6062 8 8 16 26.8693 After: Computing best tile sizes for each type ................................................. bytes, tile width, tile height, bandwidth (GB/s): 1 16 32 34.1921 1 16 16 31.8399 1 8 16 25.575 1 16 64 25.1665 1 32 16 25.0061 2 8 32 28.2635 2 8 16 27.7648 2 16 16 27.2126 2 16 32 23.9034 2 8 8 23.6345 4 8 16 34.5303 4 8 8 28.3653 4 16 8 26.8521 4 8 32 26.084 4 16 16 24.4519 8 8 8 33.7163 8 8 4 29.1339 8 4 16 26.418 8 16 4 25.4663 8 2 8 24.3949 --- src/CodeGen_LLVM.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7715fce28c34..72b648feff1b 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2195,7 +2195,10 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { internal_assert(!t->isScalableTy()) << "optimization_fence does not support scalable vectors yet"; const int bits = t->getPrimitiveSizeInBits(); - llvm::Type *float_type = llvm_type_of(Float(64, bits / 64)); + if (bits % 16) { + return v; + } + llvm::Type *float_type = llvm_type_of(Float(16, bits / 16)); v = builder->CreateBitCast(v, float_type); v = builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); @@ -2217,7 +2220,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { for (int i = 0; i < vec_elements * 2; i++) { indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements; } - return shuffle_vectors(a, b, indices); + return optimization_fence(shuffle_vectors(a, b, indices)); } else { // Grab the even and odd elements of vecs. 
vector even_vecs; From 3eef5dbac3c531283fd752e3d50f7db1d194ce55 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 12 Feb 2026 07:49:33 -0800 Subject: [PATCH 11/34] Use Catanzaro's algorithm for non-power-of-two interleaves --- src/CodeGen_LLVM.cpp | 120 ++++++++++++++++-------- test/performance/CMakeLists.txt | 1 + test/performance/interleave.cpp | 159 ++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 39 deletions(-) create mode 100644 test/performance/interleave.cpp diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index b5aa069e673d..57aa43299eea 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2211,6 +2211,8 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } int vec_elements = get_vector_num_elements(vecs[0]->getType()); + int factor = gcd(vec_elements, (int)vecs.size()); + if (vecs.size() == 1) { return vecs[0]; } else if (vecs.size() == 2) { @@ -2221,57 +2223,97 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements; } return optimization_fence(shuffle_vectors(a, b, indices)); - } else { - // Grab the even and odd elements of vecs. - vector even_vecs; - vector odd_vecs; - for (size_t i = 0; i < vecs.size(); i++) { - if (i % 2 == 0) { - even_vecs.push_back(vecs[i]); - } else { - odd_vecs.push_back(vecs[i]); + } else if (factor == 1) { + // The number of vectors and the vector length is + // coprime. (E.g. interleaving an odd number of vectors of some + // power-of-two length). Use the algorithm from "A Decomposition for + // In-place Matrix Transposition" by Catanzaro et al. + std::vector v = vecs; + + // Using unary shuffles, get each element into the right ultimate + // lane. This works out without collisions because the number of vectors + // and the length of each vector is coprime. 
+ const int num_vecs = (int)v.size(); + std::vector shuffle(vec_elements); + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + shuffle[k % vec_elements] = j; } + v[i] = shuffle_vectors(v[i], v[i], shuffle); } - // If the number of vecs is odd, save the last one for later. - Value *last = nullptr; - if (even_vecs.size() > odd_vecs.size()) { - last = even_vecs.back(); - even_vecs.pop_back(); + // We intentionally don't put an optimization fence after the unary + // shuffles, because some architectures have a two-way shuffle, so it + // helps to fuse the unary shuffle into the first layer of two-way + // blends below. + + // Now we need to transfer the elements across the vectors. If we + // reorder the vectors, this becomes a rotation across the vectors of a + // different amount per lane. + std::vector new_v(v.size()); + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + new_v[i] = v[j]; } - internal_assert(even_vecs.size() == odd_vecs.size()); + v.swap(new_v); - // Interleave the even and odd parts. - Value *even = interleave_vectors(even_vecs); - Value *odd = interleave_vectors(odd_vecs); + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; + } + internal_assert(rotation[0] == 0); - if (last) { - int result_elements = vec_elements * vecs.size(); + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + int d = 1; + while (d < num_vecs) { - // Interleave even and odd, leaving a space for the last element. - vector indices(result_elements, -1); - for (int i = 0, idx = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size(); - idx++; - } + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? 
i : (i + vec_elements); } - Value *even_odd = shuffle_vectors(even, odd, indices); - // Interleave the last vector into the result. - last = slice_vector(last, 0, result_elements); - for (int i = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = i; - } else { - indices[i] = i / vecs.size() + result_elements; - } + for (int i = 0; i < num_vecs; i++) { + int j = (i + num_vecs - d) % num_vecs; + new_v[i] = shuffle_vectors(v[i], v[j], shuffle); } - return shuffle_vectors(even_odd, last, indices); - } else { - return interleave_vectors({even, odd}); + v.swap(new_v); + + d *= 2; } + + return concat_vectors(v); + + } else { + // The number of vectors shares a factor with the length of the + // vectors. Pick some large factor of the number of vectors, interleave + // in separate groups, and then interleave the results. + const int n = (int)vecs.size(); + int f = 1; + for (int i = 2; i < n; i++) { + if (n % i == 0) { + f = i; + break; + } + } + + internal_assert(f > 1 && f < n); + + vector> groups(f); + for (size_t i = 0; i < vecs.size(); i++) { + groups[i % f].push_back(vecs[i]); + } + + // Interleave each group + vector interleaved(f); + for (int i = 0; i < f; i++) { + interleaved[i] = optimization_fence(interleave_vectors(groups[i])); + } + + // Interleave the result + return interleave_vectors(interleaved); } } diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index 851e7e3ae506..5978ac2961a3 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -16,6 +16,7 @@ tests(GROUPS performance fast_pow.cpp fast_sine_cosine.cpp gpu_half_throughput.cpp + interleave.cpp jit_stress.cpp lots_of_inputs.cpp memcpy.cpp diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp new file mode 100644 index 000000000000..f73d7b687ac4 --- /dev/null +++ b/test/performance/interleave.cpp @@ -0,0 +1,159 @@ +#include "Halide.h" +#include "halide_benchmark.h" +#include 
"halide_test_dirs.h" + +#include + +using namespace Halide; +using namespace Halide::Tools; + +struct Result { + int type_size, factor; + double bandwidth; +}; + +template +Result test_interleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N, factor), out(N * factor); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + in(x, y) = (T)(x * factor + y); + } + } + + Func output; + Var x, y; + + output(x) = in(x / factor, x % factor); + + Var xi, yi; + output.unroll(x, factor, TailStrategy::RoundUp).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + output.output_buffer().dim(0).set_min(0); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + uint64_t actual = out(x * factor + y), correct = in(x, y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << " * " << factor << " + " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "interleave", t); + + return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; +} + +template +Result test_deinterleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N * factor), out(N, factor); + + for (int x = 0; x < N; x++) { + for (int y = 0; y < factor; y++) { + in(x * factor + y) = (T)(x + y * N); + } + } + + Func output; + Var x, y; + + output(x, y) = in(x * factor + y); + + Var xi, yi; + output.reorder(y, x).bound(y, 0, factor).unroll(y).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + // output.output_buffer().dim(0).set_min(0); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < 
N; x++) { + uint64_t actual = out(x, y), correct = in(x * factor + y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << ", " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + output.compile_to_assembly("/dev/stdout", + std::vector{in}, "interleave", t); + + return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + if (target.arch == Target::WebAssembly) { + printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); + return 0; + } + + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n"; +#if 0 + for (int t : {1, 2, 4, 8}) { + for (int f = 2; f < 16; f++) { +#else + { + { + int t = 1, f = 4; +#endif + Result r1, r2; + switch (t) { + case 1: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 2: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 4: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 8: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + default: + break; + } + std::cout << r1.type_size << " " + << r1.factor << " " + << r1.bandwidth << " " + << r2.bandwidth << "\n"; + + } + } + + printf("Success!\n"); + return 0; +} From 678a353650869e42cafd5e5e9168a21a99f67b08 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 18 Feb 2026 14:28:04 -0800 Subject: [PATCH 12/34] Support more interleave and deinterleave patterns --- src/CodeGen_LLVM.cpp | 137 +++++++++++++++++++++- src/CodeGen_LLVM.h | 3 + src/CodeGen_X86.cpp | 164 +++++++++++++++++++++++++-- 
src/IR.cpp | 40 +++++++ src/IR.h | 8 ++ src/IRPrinter.cpp | 5 + src/Simplify_Exprs.cpp | 16 ++- src/Simplify_Stmts.cpp | 20 ++++ src/StageStridedLoads.cpp | 109 +++++++++++++++++- test/performance/block_transpose.cpp | 13 ++- test/performance/interleave.cpp | 17 +-- 11 files changed, 496 insertions(+), 36 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 57aa43299eea..0350ed0e5035 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2288,8 +2288,9 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } else { // The number of vectors shares a factor with the length of the - // vectors. Pick some large factor of the number of vectors, interleave - // in separate groups, and then interleave the results. + // vectors. Pick some factor of the number of vectors, interleave in + // separate groups, and then interleave the results. Doing the smallest + // factor first seems to be fastest. const int n = (int)vecs.size(); int f = 1; for (int i = 2; i < n; i++) { @@ -2317,6 +2318,120 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } } +std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()); + internal_assert(vec_elements % num_vecs == 0); + vec_elements /= num_vecs; + + int factor = gcd(vec_elements, num_vecs); + + if (num_vecs == 1) { + return {vec}; + } else if (num_vecs == 2) { + std::vector result(2); + std::vector indices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + indices[i] = i * 2; + } + result[0] = shuffle_vectors(vec, vec, indices); + for (int i = 0; i < vec_elements; i++) { + indices[i]++; + } + result[1] = shuffle_vectors(vec, vec, indices); + return result; + } else if (factor == 1) { + // Use the inverse of Catanzaro's algorithm from above. We slice into + // distinct vectors, then rotate each element into the correct final + // vector, then do a unary permutation of each vector. 
+ std::vector shuffle(vec_elements); + + // Instead of concatenating, we slice. + std::vector v(num_vecs); + for (int i = 0; i < num_vecs; i++) { + v[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + + // Compute the same rotation as above + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; + } + internal_assert(rotation[0] == 0); + + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + std::vector new_v(v.size()); + int d = 1; + while (d < num_vecs) { + + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements); + } + + for (int i = 0; i < num_vecs; i++) { + // The rotation is in the opposite direction to the interleaving + // version, so num_vecs - d becomes just d. + int j = (i + d) % num_vecs; + // An optimization fence here keeps it as a blend and stops it + // from getting fused with the unary shuffle below. + new_v[i] = optimization_fence(shuffle_vectors(v[i], v[j], shuffle)); + } + + v.swap(new_v); + d *= 2; + } + + // Now reorder the vectors in the inverse order to the above. + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + // j and i are swapped below, because we're doing the inverse of the algorithm above + new_v[j] = v[i]; + } + v.swap(new_v); + + // The elements are now in the correct vector. Finish up with a unary + // shuffle of each. + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + // This is the inverse shuffle of the interleaving version, so + // the index and the arg of the assignment below are swapped + // compared to the above. + shuffle[j] = k % vec_elements; + } + + v[i] = shuffle_vectors(v[i], v[i], shuffle); + } + + return v; + + } else { + // Do a lower-factor deinterleave, then deinterleave each result + // again. 
We know there's a non-trivial factor because if it were prime + // the gcd above would have been 1. Unlike interleave, doing the largest + // factor first seems to be fastest. + int f = 1; + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + } + } + + auto partial = deinterleave_vector(vec, f); + std::vector result(num_vecs); + for (size_t i = 0; i < partial.size(); i++) { + Value *v = partial[i]; + auto vecs = deinterleave_vector(v, num_vecs / f); + for (size_t j = 0; j < vecs.size(); j++) { + result[j * f + i] = vecs[j]; + } + } + + return result; + } +} + void CodeGen_LLVM::scalarize(const Expr &e) { llvm::Type *result_type = llvm_type_of(e.type()); @@ -4178,6 +4293,24 @@ void CodeGen_LLVM::visit(const Shuffle *op) { if (op->is_interleave()) { value = interleave_vectors(vecs); + } else if (op->is_transpose()) { + int cols = op->transpose_factor(); + int rows = op->vectors[0].type().lanes() / cols; + if (is_power_of_two(cols) && + !is_power_of_two(rows)) { + // We're doing something like vectorizing over c and x when storing + // packed rgb. Best handled as an interleave. + std::vector slices(rows); + for (int i = 0; i < rows; i++) { + slices[i] = slice_vector(vecs[0], i * cols, cols); + } + value = interleave_vectors(slices); + } else { + // Deinterleave out the cols of the input matrix and concat + // them. Occurs when, for example, loading packed RGB and + // vectorizing across x. + value = concat_vectors(deinterleave_vector(vecs[0], cols)); + } } else if (op->is_concat()) { value = concat_vectors(vecs); } else { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index e006a885fc57..46ec05638e3f 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -460,6 +460,9 @@ class CodeGen_LLVM : public IRVisitor { * an arbitrary number of vectors.*/ virtual llvm::Value *interleave_vectors(const std::vector &); + /** The inverse of interleave_vectors. 
*/ + virtual std::vector deinterleave_vector(llvm::Value *vec, int num_vecs); + /** A fence to prevent fusion of ops by llvm. Designed for floats, but we * abuse it to prevent shufflevector fusion too. */ llvm::Value *optimization_fence(llvm::Value *); diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 21e399a16965..0e09443859b8 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -113,6 +113,7 @@ class CodeGen_X86 : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; // @} + std::vector deinterleave_vector(llvm::Value *, int) override; llvm::Value *interleave_vectors(const std::vector &) override; private: @@ -910,6 +911,30 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init CodeGen_Posix::codegen_vector_reduce(op, init); } +std::vector CodeGen_X86::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()) / num_vecs; + const size_t element_bits = vec->getType()->getScalarSizeInBits(); + if (target.has_feature(Target::AVX) && + is_power_of_two(num_vecs) && + is_power_of_two(vec_elements) && + (int)(vec_elements * num_vecs * element_bits) > native_vector_bits()) { + + // Our interleaving logic below supports this case + std::vector slices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + slices[i] = slice_vector(vec, i * num_vecs, num_vecs); + } + vec = interleave_vectors(slices); + std::vector result(num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + return result; + } else { + return CodeGen_Posix::deinterleave_vector(vec, num_vecs); + } +} + Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Only use x86-specific interleaving for AVX and above if (vecs.empty() || !target.has_feature(Target::AVX)) { @@ -1146,6 +1171,24 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Now we define helpers for each 
instruction we are going to use + // Useful for debugging or enhancing this algorithm + /* + auto dump_bits = [&]() { + for (int b : l_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : s_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : v_bits) { + debug(0) << b << " "; + } + debug(0) << "\n"; + }; + */ + // unpckl/h instruction auto unpck = [&](Value *a, Value *b) -> std::pair { int n = get_vector_num_elements(a->getType()); @@ -1258,6 +1301,99 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { s_bits.pop_back(); } + // If adjacent vectors are shuffles of the same underlying vector(s), + // concatenate pairs, because this is probably free. + while ((size_t)vec_elements < elems_per_native_vec && !v_bits.empty()) { + std::vector new_v; + new_v.reserve(v.size() / 2); + bool fail = false; + std::vector indices; + indices.reserve(vec_elements * 2); + for (size_t i = 0; i < v.size(); i += 2) { + ShuffleVectorInst *a = llvm::dyn_cast(v[i]); + ShuffleVectorInst *b = llvm::dyn_cast(v[i + 1]); + if (a && + b && + a->getOperand(0) == b->getOperand(0) && + a->getOperand(1) == b->getOperand(1)) { + + // Concatenate the two shuffles + indices.clear(); + for (int j : a->getShuffleMask()) { + indices.push_back(j); + } + for (int j : b->getShuffleMask()) { + indices.push_back(j); + } + new_v.push_back(shuffle_vectors(a->getOperand(0), a->getOperand(1), indices)); + } else { + fail = true; + } + } + if (fail) { + break; + } + + v.swap(new_v); + // The lowest vector bit becomes the highest lane or slice bit + if ((size_t)vec_elements < elems_per_slice) { + l_bits.push_back(v_bits[0]); + } else { + s_bits.push_back(v_bits[0]); + } + v_bits.erase(v_bits.begin()); + vec_elements *= 2; + } + + if (final_num_s_bits > 1 && + (size_t)vec_elements == elems_per_native_vec && + (size_t)v_bits[0] >= l_bits.size() - 1) { + // A big binary shuffle of adjacent pairs will fix the l bits + // entirely. AVX-512 has these. 
Yes, this will use registers for the + // shuffle indices, but the alternative requires very many unpck + // operations to completely cycle out the v_bits that are hiding in the + // bottom of the l_bits. + + std::vector lo_indices(vec_elements); + std::vector hi_indices(vec_elements); + std::vector sorted_bits = l_bits; + sorted_bits.insert(sorted_bits.end(), s_bits.begin(), s_bits.end()); + sorted_bits.push_back(v_bits[0]); + std::sort(sorted_bits.begin(), sorted_bits.end()); + std::vector idx_of_bit(l_bits.size() + s_bits.size() + v_bits.size(), 0); + for (size_t b = 0; b < sorted_bits.size(); b++) { + idx_of_bit[sorted_bits[b]] = b; + } + + for (size_t dst_idx = 0; dst_idx < (size_t)vec_elements * 2; dst_idx++) { + size_t src_idx = 0; + for (size_t b = 0; b < l_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[l_bits[b]]) & 1) << b; + } + for (size_t b = 0; b < s_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[s_bits[b]]) & 1) << (b + l_bits.size()); + } + src_idx |= ((dst_idx >> idx_of_bit[v_bits[0]]) & 1) << (l_bits.size() + s_bits.size()); + if (dst_idx < (size_t)vec_elements) { + lo_indices[dst_idx] = (int)src_idx; + } else { + hi_indices[dst_idx - vec_elements] = (int)src_idx; + } + } + + for_all_pairs(0, [&](auto *a, auto *b) { + Value *lo = shuffle_vectors(*a, *b, lo_indices); + Value *hi = shuffle_vectors(*a, *b, hi_indices); + *a = lo; + *b = hi; + }); + + auto first_s_bit = sorted_bits.begin() + l_bits.size(); + std::copy(sorted_bits.begin(), first_s_bit, l_bits.begin()); + std::copy(first_s_bit, first_s_bit + s_bits.size(), s_bits.begin()); + v_bits[0] = sorted_bits.back(); + } + // Interleave pairs if we have vectors smaller than a single slice. 
Choosing // which pairs to interleave is important because we want to pull down v // bits that are destined to end up as l bits, and we want to pull them down @@ -1300,9 +1436,8 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Concatenate/repack to get at least the desired number of slice bits. while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) { - int desired_low_slice_bit = ctz64(elems_per_slice); - int desired_high_slice_bit = desired_low_slice_bit + 1; - + const int desired_low_slice_bit = ctz64(elems_per_slice); + const int desired_high_slice_bit = desired_low_slice_bit + 1; int bit; if (!s_bits.empty() && s_bits[0] == desired_low_slice_bit) { @@ -1340,7 +1475,9 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { // Now we have at least two whole vectors. Next we try to finalize lane bits using // unpck instructions. while (l_bits[0] != 0) { - int bit = std::min(l_bits[0], (int)ctz64(elems_per_slice)) - 1; + + int first_s_bit = (int)ctz64(elems_per_slice); + int bit = std::min(l_bits[0], first_s_bit) - 1; auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); @@ -1348,11 +1485,17 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { if (vb_it == v_bits.end()) { // The next bit is not in vector bits. It must be hiding in the // slice bits due to earlier concatenation. Move it into the v_bits - // with a shufi + // with a shufi. We'll need to pick a v bit to take its place, + // ideally one destined to end up in the s bits. + vb_it = std::find_if(v_bits.begin(), v_bits.end(), [&](int b) { return b >= first_s_bit; }); + if (vb_it == v_bits.end()) { + vb_it = v_bits.begin(); + } + if (s_bits.back() == bit) { // It's the last (or sole) slice bit. 
Swap it with the first v bit - std::swap(s_bits.back(), v_bits[0]); - for_all_pairs(0, [&](auto *a, auto *b) { + std::swap(s_bits.back(), *vb_it); + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { auto [lo, hi] = shufi(*a, *b, false); *a = lo; *b = hi; @@ -1360,17 +1503,16 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { } else { internal_assert(s_bits.size() == 2 && s_bits[0] == bit); // It's the low slice bit. We need shufi with crossover. - int v_bit = v_bits[0]; - v_bits[0] = s_bits[0]; + int v_bit = *vb_it; + *vb_it = s_bits[0]; s_bits[0] = s_bits[1]; s_bits[1] = v_bit; - for_all_pairs(0, [&](auto *a, auto *b) { + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { auto [lo, hi] = shufi(*a, *b, true); *a = lo; *b = hi; }); } - vb_it = v_bits.begin(); } int j = vb_it - v_bits.begin(); diff --git a/src/IR.cpp b/src/IR.cpp index c82ae4ebd252..049ad8848aaa 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -815,6 +815,21 @@ Expr Shuffle::make_interleave(const std::vector &vectors) { return make(vectors, indices); } +Expr Shuffle::make_transpose(Expr e, int cols) { + internal_assert(e.type().lanes() % cols == 0) + << "Transpose cols must divide the number of lanes.\n"; + int rows = e.type().lanes() / cols; + + std::vector indices(e.type().lanes()); + for (int j = 0; j < cols; j++) { + for (int i = 0; i < rows; i++) { + indices[j * rows + i] = i * cols + j; + } + } + + return make({std::move(e)}, indices); +} + Expr Shuffle::make_concat(const std::vector &vectors) { internal_assert(!vectors.empty()) << "Concat of zero vectors.\n"; @@ -1012,6 +1027,31 @@ bool Shuffle::is_concat() const { return indices.size() == input_lanes && is_ramp(indices); } +bool Shuffle::is_transpose() const { + if (vectors.size() > 1 || + (int)indices.size() != vectors[0].type().lanes() || + indices.size() < 2) { + return false; + } + int cols = indices[1] - indices[0]; + int rows = vectors[0].type().lanes() / cols; + if ((int)indices.size() != rows * 
cols) { + return false; + } + for (int row = 0; row < rows; row++) { + for (int col = 0; col < cols; col++) { + if (indices[col * rows + row] != row * cols + col) { + return false; + } + } + } + return true; +} + +int Shuffle::transpose_factor() const { + return indices[1] - indices[0]; +} + bool Shuffle::is_slice() const { size_t input_lanes = 0; for (const Expr &i : vectors) { diff --git a/src/IR.h b/src/IR.h index da27019a93c7..78d61be2349c 100644 --- a/src/IR.h +++ b/src/IR.h @@ -910,6 +910,10 @@ struct Shuffle : public ExprNode { * interleaving of vectors of the same length. */ static Expr make_interleave(const std::vector &vectors); + /** Convenience constructor for making a shuffle representing an + * in-place transpose of a matrix with the given number of columns. */ + static Expr make_transpose(Expr e, int cols); + /** Convenience constructor for making a shuffle representing a * concatenation of the vectors. */ static Expr make_concat(const std::vector &vectors); @@ -930,6 +934,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; + /** Check if this shuffle is an in-place transpose of a single vector */ + bool is_transpose() const; + int transpose_factor() const; + /** Check if this shuffle can be represented as a repeating pattern that * repeats the same shuffle of the single input vector some number of times. 
* For example: 0, 3, 1, 1, 0, 3, 1, 1, ....., 0, 3, 1, 1 diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index e95286af03ee..9cd5527b09a6 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1461,6 +1461,11 @@ void IRPrinter::visit(const Shuffle *op) { stream << paren(", ") << imm_int(op->slice_begin()) << paren(", ") << imm_int(op->slice_stride()) << paren(", ") << imm_int(op->indices.size()); + } else if (op->is_transpose()) { + openf("transpose_vector"); + print_list(op->vectors); + stream << paren(", ") << imm_int(op->transpose_factor()); + } else { openf("shuffle"); print_list(op->vectors); diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 0eb3bbaf3c15..52665c0c2894 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -327,8 +327,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); @@ -360,6 +361,17 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense loads and transpose the + // resulting vector instead. 
+ Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_load = + Load::make(op->type, op->name, transposed_index, op->image, op->param, predicate, align); + return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index bbacbe69b55d..308254ff1b9a 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -348,6 +348,7 @@ Stmt Simplify::visit(const Store *op) { base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); const Load *load = value.as(); + const Shuffle *shuf = index.as(); const Broadcast *scalar_pred = predicate.as(); if (scalar_pred && !scalar_pred->value.type().is_scalar()) { // Nested vectorization @@ -365,6 +366,25 @@ Stmt Simplify::visit(const Store *op) { } else if (is_undef(value) || (load && load->name == op->name && equal(load->index, index))) { // foo[x] = foo[x] or foo[x] = undef is a no-op return Evaluate::make(0); + } else if (shuf && shuf->is_concat()) { + // Break a store of a concat of vector indices into separate stores + std::string var_name = unique_name('t'); + Expr var = Variable::make(value.type(), var_name); + std::vector stores; + int lanes = 0; + for (size_t i = 0; i < shuf->vectors.size(); i++) { + Expr idx = shuf->vectors[i]; + stores.push_back(Store::make(op->name, + Shuffle::make_slice(var, lanes, 1, idx.type().lanes()), + shuf->vectors[i], + op->param, + Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()), + ModulusRemainder{})); + lanes += idx.type().lanes(); + } + Stmt s = Block::make(stores); + s = LetStmt::make(var_name, value, s); + return mutate(s); } else if (predicate.same_as(op->predicate) && 
value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 85691921bc8d..16e4680323f4 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -1,5 +1,6 @@ #include "StageStridedLoads.h" #include "CSE.h" +#include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" @@ -95,12 +96,15 @@ class FindStridedLoads : public IRVisitor { base = base_add->a; offset = *off; } + } else if (auto off = as_const_int(base)) { + base = 0; + offset = *off; } // TODO: We do not yet handle nested vectorization here for // ramps which have not already collapsed. We could potentially // handle more interesting types of shuffle than simple flat slices. - if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { + if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { @@ -157,6 +161,19 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; + std::map> let_injections; + + using IRMutator::mutate; + + Stmt mutate(const Stmt &s) override { + auto it = let_injections.find(s); + if (it != let_injections.end()) { + const auto &[name, value] = it->second; + return LetStmt::make(name, value, IRMutator::mutate(s)); + } else { + return IRMutator::mutate(s); + } + } protected: Expr visit(const Load *op) override { @@ -191,6 +208,61 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; +Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { + std::vector path, result; + mutate_with(root, // + [&](auto *self, const Stmt &s) { + path.push_back(s); + self->mutate_base(s); + path.pop_back(); + return s; // + }, + [&](auto *self, const Expr &e) { + const Load *l = e.as(); + if (l 
&& exprs.count(l)) { + if (result.empty()) { + result = path; + } else { + // Find the common prefix of path and result + size_t i = 0; + while (i < path.size() && + i < result.size() && + path[i].get() == result[i].get()) { + i++; + } + result.resize(i); + } + }; + return self->mutate_base(e); // + }); + internal_assert(!result.empty()) << "None of the exprs were found\n"; + return result.back(); +} + +bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { + // Check none of the variables the idx depends on are defined somewhere + // within this stmt, and there are no stores to the given buffer in the + // stmt. + bool result = true; + visit_with(s, // + [&](auto *self, const Let *let) { // + result &= !expr_uses_var(idx, let->name); + }, + [&](auto *self, const LetStmt *let) { // + result &= !expr_uses_var(idx, let->name); + }, + [&](auto *self, const For *loop) { // + result &= !expr_uses_var(idx, loop->name); + }, + [&](auto *self, const Allocate *alloc) { // + result &= alloc->name != buf; + }, + [&](auto *self, const Store *store) { // + result &= store->name != buf; + }); + return result; +} + } // namespace Stmt stage_strided_loads(const Stmt &s) { @@ -218,6 +290,7 @@ Stmt stage_strided_loads(const Stmt &s) { const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); if (!can_lift) { + debug(0) << "Can't lift: " << Expr(load->second[0]->index) << "\n"; load++; continue; } @@ -228,13 +301,39 @@ Stmt stage_strided_loads(const Stmt &s) { Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; + + std::set all_loads; + for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) { + all_loads.insert(l->second.begin(), l->second.end()); + } + Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, const_true(lanes), op->alignment); shared_load = 
common_subexpression_elimination(shared_load); - for (; load != v.end() && load->first < first_offset + k.stride; load++) { - Expr shuf = Shuffle::make_slice(shared_load, load->first - first_offset, k.stride, k.lanes); - for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + + // If possible, we do the shuffle as an in-place transpose followed + // by a dense slice. This is more efficient when extracting multiple + // slices. + Stmt let_site = innermost_containing_stmt(alloc ? Stmt(alloc) : s, all_loads); + if (can_hoist_shared_load(let_site, k.buf, idx)) { + shared_load = Shuffle::make_transpose(shared_load, k.stride); + std::string name = unique_name('t'); + Expr var = Variable::make(shared_load.type(), name); + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + int row = load->first - first_offset; + Expr shuf = Shuffle::make_slice(var, row * k.lanes, 1, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } + } + replacer.let_injections.emplace(let_site, std::make_pair(name, shared_load)); + } else { + for (; load != v.end() && load->first < first_offset + k.stride; load++) { + int row = load->first - first_offset; + Expr shuf = Shuffle::make_slice(shared_load, row, k.stride, k.lanes); + for (const Load *l : load->second) { + replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + } } } } diff --git a/test/performance/block_transpose.cpp b/test/performance/block_transpose.cpp index 8760d8ac5495..921d7f9a913b 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -33,13 +33,16 @@ Result test_transpose(int block_width, int block_height, const Target &t) { Var xi, yi; output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp) .vectorize(xi) - .unroll(yi); + .vectorize(yi); - // Do vectorized loads from the input. 
- input.in().compute_at(output, x).vectorize(x).unroll(y); + // Explicitly vectorized loads from the input. Was necessary before we + // automatically swizzled the 2D load into dense order. + // input.in().compute_at(output, x).vectorize(x).unroll(y); - // Transpose in registers - input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + // Explicit transpose in registers. This used to be the idiom, but is no + // longer necessary because stage_strided_loads should detect the strided + // loads from input.in() and turn it into a transpose. + // input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); // TODO: Should not be necessary, but prevents licm from doing something dumb. output.output_buffer().dim(0).set_bounds(0, 256); diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp index f73d7b687ac4..3df42ed0237f 100644 --- a/test/performance/interleave.cpp +++ b/test/performance/interleave.cpp @@ -76,8 +76,10 @@ Result test_deinterleave(int factor, const Target &t) { output(x, y) = in(x * factor + y); Var xi, yi; - output.reorder(y, x).bound(y, 0, factor).unroll(y).vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); - // output.output_buffer().dim(0).set_min(0); + output.bound(y, 0, factor) + .reorder(y, x) + .unroll(y) // Also works if we vectorize y + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); output.compile_jit(); @@ -100,8 +102,8 @@ Result test_deinterleave(int factor, const Target &t) { } // Uncomment to dump asm for inspection - output.compile_to_assembly("/dev/stdout", - std::vector{in}, "interleave", t); + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "deinterleave", t); return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; } @@ -117,14 +119,8 @@ int main(int argc, char **argv) { target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); std::cout << "\nbytes, interleave factor, 
interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n"; -#if 0 for (int t : {1, 2, 4, 8}) { for (int f = 2; f < 16; f++) { -#else - { - { - int t = 1, f = 4; -#endif Result r1, r2; switch (t) { case 1: @@ -150,7 +146,6 @@ int main(int argc, char **argv) { << r1.factor << " " << r1.bandwidth << " " << r2.bandwidth << "\n"; - } } From 4c1adf779b2644103ed63458dcc4cbce449a480c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 09:48:06 -0800 Subject: [PATCH 13/34] clang-tidy fix --- src/Simplify_Stmts.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 308254ff1b9a..1b4588342096 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -372,11 +372,10 @@ Stmt Simplify::visit(const Store *op) { Expr var = Variable::make(value.type(), var_name); std::vector stores; int lanes = 0; - for (size_t i = 0; i < shuf->vectors.size(); i++) { - Expr idx = shuf->vectors[i]; + for (const Expr &idx : shuf->vectors) { stores.push_back(Store::make(op->name, Shuffle::make_slice(var, lanes, 1, idx.type().lanes()), - shuf->vectors[i], + idx, op->param, Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()), ModulusRemainder{})); From 1c940e8fb34459738f7d0e1ace636685bba47ea4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 09:48:26 -0800 Subject: [PATCH 14/34] Handle multiple let injections at same site Also better algorithm for innermost containing stmt --- src/StageStridedLoads.cpp | 43 +++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 16e4680323f4..c159d9d62b0a 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -161,18 +161,19 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; - std::map> let_injections; + std::map>> let_injections; using 
IRMutator::mutate; Stmt mutate(const Stmt &s) override { + Stmt stmt = IRMutator::mutate(s); auto it = let_injections.find(s); if (it != let_injections.end()) { - const auto &[name, value] = it->second; - return LetStmt::make(name, value, IRMutator::mutate(s)); - } else { - return IRMutator::mutate(s); + for (const auto &[name, value] : it->second) { + stmt = LetStmt::make(name, value, stmt); + } } + return stmt; } protected: @@ -209,34 +210,29 @@ class ReplaceStridedLoads : public IRMutator { }; Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { - std::vector path, result; + Stmt result; + // The innermost containing stmt is whichever stmt node contains the + // largest number of our exprs, with ties breaking inwards. + int seen = 0, best = 0; mutate_with(root, // [&](auto *self, const Stmt &s) { - path.push_back(s); + int old = seen; self->mutate_base(s); - path.pop_back(); + if (old == 0 && seen > best) { + result = s; + best = seen; + } return s; // }, [&](auto *self, const Expr &e) { const Load *l = e.as(); if (l && exprs.count(l)) { - if (result.empty()) { - result = path; - } else { - // Find the common prefix of path and result - size_t i = 0; - while (i < path.size() && - i < result.size() && - path[i].get() == result[i].get()) { - i++; - } - result.resize(i); - } + seen++; }; return self->mutate_base(e); // }); - internal_assert(!result.empty()) << "None of the exprs were found\n"; - return result.back(); + internal_assert(seen) << "None of the exprs were found\n"; + return result; } bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { @@ -290,7 +286,6 @@ Stmt stage_strided_loads(const Stmt &s) { const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end(); if (!can_lift) { - debug(0) << "Can't lift: " << Expr(load->second[0]->index) << "\n"; load++; continue; } @@ -326,7 +321,7 @@ Stmt stage_strided_loads(const Stmt &s) { replacer.replacements.emplace(std::make_pair(alloc, 
l), shuf); } } - replacer.let_injections.emplace(let_site, std::make_pair(name, shared_load)); + replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; From c39b1a0505396bf95c5b1829872981b4bb709b4a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 19 Feb 2026 16:30:41 -0800 Subject: [PATCH 15/34] better simplification and better handling of composite factors --- apps/iir_blur/iir_blur_generator.cpp | 3 +- src/CSE.cpp | 35 ++++++++++++++++-- src/CodeGen_LLVM.cpp | 35 ++++++++++-------- src/CodeGen_X86.cpp | 20 +++++++++-- src/IRMatch.h | 54 ++++++++++++++++++++++++++++ src/Simplify_Add.cpp | 1 + src/Simplify_EQ.cpp | 1 + src/Simplify_Max.cpp | 1 + src/Simplify_Min.cpp | 1 + src/Simplify_Mul.cpp | 1 + src/Simplify_Sub.cpp | 1 + src/StageStridedLoads.cpp | 43 ++++++++++++++-------- 12 files changed, 160 insertions(+), 36 deletions(-) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 3c4dee4304af..7f411d7e8fef 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -43,13 +43,12 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule transpose.compute_root() .tile(x, y, xo, yo, x, y, vec, vec * 4) .split(y, y, yi, vec) - .unroll(yi) + .vectorize(yi) .vectorize(x) .fuse(yo, c, t) .parallel(t); blur.in(transpose) - .reorder_storage(y, x) .compute_at(transpose, y) .vectorize(x) .unroll(y); diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..e7e56bb4df09 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -237,10 +237,39 @@ class CSEEveryExprInStmt : public IRMutator { } const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle}); internal_assert(bundle && bundle->args.size() == 2); - Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1], + + Expr value = bundle->args[0], index = bundle->args[1]; + + // Figure 
out which ones are actually needed by the index + + auto add_all_vars_to_set = [&](const Expr &e, std::set &s) { + visit_with(e, [&](auto *, const Variable *var) { + s.insert(var->name); + }); + }; + + std::set index_lets; + add_all_vars_to_set(index, index_lets); + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + add_all_vars_to_set(val, index_lets); + } + } + + vector> deferred; + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + deferred.emplace_back(var, val); + } else { + value = Let::make(var, val, value); + } + } + + Stmt s = Store::make(op->name, value, index, op->param, mutate(op->predicate), op->alignment); - for (const auto &[var, value] : reverse_view(lets)) { - s = LetStmt::make(var, value, s); + + for (const auto &[var, val] : deferred) { + s = LetStmt::make(var, val, s); } return s; } diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9fcc3a6cd046..a5937f123cfe 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2289,18 +2289,20 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } else { // The number of vectors shares a factor with the length of the // vectors. Pick some factor of the number of vectors, interleave in - // separate groups, and then interleave the results. Doing the smallest - // factor first seems to be fastest. + // separate groups, and then interleave the results. Do the largest + // power of two factor first. 
const int n = (int)vecs.size(); - int f = 1; - for (int i = 2; i < n; i++) { - if (n % i == 0) { - f = i; - break; + int f = n & -n; + if (f == 1 || f == n) { + for (int i = 2; i < n; i++) { + if (n % i == 0) { + f = i; + break; + } } } - internal_assert(f > 1 && f < n); + internal_assert(f > 1 && f < n && n % f == 0) << f << " " << n; vector> groups(f); for (size_t i = 0; i < vecs.size(); i++) { @@ -2409,15 +2411,20 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) } else { // Do a lower-factor deinterleave, then deinterleave each result // again. We know there's a non-trivial factor because if it were prime - // the gcd above would have been 1. Unlike interleave, doing the largest - // factor first seems to be fastest. - int f = 1; - for (int i = 2; i < num_vecs; i++) { - if (num_vecs % i == 0) { - f = i; + // the gcd above would have been 1. Do the largest power-of-two factor + // first. + int f = num_vecs & -num_vecs; + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + break; + } } } + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) << f << " " << num_vecs; + auto partial = deinterleave_vector(vec, f); std::vector result(num_vecs); for (size_t i = 0; i < partial.size(); i++) { diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0e09443859b8..edf79a4db15c 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -954,9 +954,23 @@ Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { const size_t elems_per_slice = 128 / element_bits; // Only apply special x86 logic for power-of-two interleaves for avx and - // above where we're going to end up with multiple native vectors (TODO: - // Could slice into native vectors and concat results even if not power of - // two) + // above where we're going to end up with multiple native vectors. 
+ + if (!is_power_of_two(vec_elements) && + vec_elements % elems_per_native_vec == 0) { + // It's not a power of two, but it's a multiple of the native vector + // length, so slice it and recurse. + std::vector results; + for (int i = 0; i < vec_elements; i += elems_per_native_vec) { + std::vector slices; + slices.reserve(vecs.size()); + for (auto *v : vecs) { + slices.push_back(slice_vector(v, i, (int)elems_per_native_vec)); + } + results.push_back(interleave_vectors(slices)); + } + return concat_vectors(results); + } if (!is_power_of_two(vec_elements) || !is_power_of_two(vecs.size()) || diff --git a/src/IRMatch.h b/src/IRMatch.h index 671a6e086e1f..7e9abc80789b 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2249,6 +2249,60 @@ HALIDE_ALWAYS_INLINE auto slice(Vec vec, Base base, Stride stride, Lanes lanes) return {pattern_arg(vec), pattern_arg(base), pattern_arg(stride), pattern_arg(lanes)}; } +template +struct TransposeOp { + struct pattern_tag {}; + Vec vec; + Factor factor; + + static constexpr uint32_t binds = Vec::binds | Factor::binds; + + constexpr static IRNodeType min_node_type = IRNodeType::Shuffle; + constexpr static IRNodeType max_node_type = IRNodeType::Shuffle; + constexpr static bool canonical = Vec::canonical && Factor::canonical; + + template + HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { + if (e.node_type != IRNodeType::Shuffle) { + return false; + } + const Shuffle &v = (const Shuffle &)e; + return v.vectors.size() == 1 && + v.is_transpose() && + vec.template match(*v.vectors[0].get(), state) && + factor.template match<(bound | bindings::mask)>(v.transpose_factor(), state); + } + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + halide_scalar_value_t factor_val; + halide_type_t ty; + factor.make_folded_const(factor_val, ty, state); + int f = (int)factor_val.u.i64; + return Shuffle::make_transpose(vec.make(state, type_hint), f); + } + + constexpr static 
bool foldable = false; + + HALIDE_ALWAYS_INLINE + TransposeOp(Vec v, Factor f) + : vec(v), factor(f) { + static_assert(Factor::foldable, "Factor of transpose should consist only of operations that constant-fold"); + } +}; + +template +std::ostream &operator<<(std::ostream &s, const TransposeOp &op) { + s << "transpose(" << op.vec << ", " << op.factor << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto transpose(Vec vec, Factor factor) noexcept + -> TransposeOp { + return {pattern_arg(vec), pattern_arg(factor)}; +} + template struct Fold { struct pattern_tag {}; diff --git a/src/Simplify_Add.cpp b/src/Simplify_Add.cpp index 6158cc9cd48c..06967a8d32d3 100644 --- a/src/Simplify_Add.cpp +++ b/src/Simplify_Add.cpp @@ -120,6 +120,7 @@ Expr Simplify::visit(const Add *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) + z), slice(x + y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (z - slice(y, c0, c1, c2)), slice(x - y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) - z), slice(x + y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) + transpose(y, c0), transpose(x + y, c0)) || (no_overflow(op->type) && (rewrite(x + x * y, x * (y + 1)) || diff --git a/src/Simplify_EQ.cpp b/src/Simplify_EQ.cpp index 994d14cd4cee..5d8c09901b49 100644 --- a/src/Simplify_EQ.cpp +++ b/src/Simplify_EQ.cpp @@ -195,6 +195,7 @@ Expr Simplify::visit(const EQ *op, ExprInfo *info) { slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) == slice(y, c0, c1, c2) + z, slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) == transpose(y, c0), transpose(x == y, c0)) || false) || (no_overflow(a.type()) && EVAL_IN_LAMBDA // (rewrite(x * y == 0, (x == 0) || (y == 0)) || diff --git a/src/Simplify_Max.cpp b/src/Simplify_Max.cpp index 
1926bc9a069e..cc4253ca718f 100644 --- a/src/Simplify_Max.cpp +++ b/src/Simplify_Max.cpp @@ -212,6 +212,7 @@ Expr Simplify::visit(const Max *op, ExprInfo *info) { rewrite(max(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(max(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(slice(y, c0, c1, c2), z)), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(z, slice(y, c0, c1, c2))), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(max(transpose(x, c0), transpose(y, c0)), transpose(max(x, y), c0)) || (no_overflow(op->type) && (rewrite(max(max(x, y) + c0, x), max(x, y + c0), c0 < 0) || diff --git a/src/Simplify_Min.cpp b/src/Simplify_Min.cpp index 3f6084c6c4f1..e6515ab280e9 100644 --- a/src/Simplify_Min.cpp +++ b/src/Simplify_Min.cpp @@ -214,6 +214,7 @@ Expr Simplify::visit(const Min *op, ExprInfo *info) { rewrite(min(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(min(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(slice(y, c0, c1, c2), z)), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(z, slice(y, c0, c1, c2))), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(min(transpose(x, c0), transpose(y, c0)), transpose(min(x, y), c0)) || (no_overflow(op->type) && (rewrite(min(min(x, y) + c0, x), min(x, y + c0), c0 > 0) || rewrite(min(min(x, y) + c0, x), min(x, y) + c0, c0 < 0) || diff --git a/src/Simplify_Mul.cpp b/src/Simplify_Mul.cpp index dfa38d39111c..e1bcb68fe7bc 100644 --- a/src/Simplify_Mul.cpp +++ b/src/Simplify_Mul.cpp @@ -81,6 +81,7 @@ Expr Simplify::visit(const Mul *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) * slice(y, c0, c1, c2), slice(x * y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (slice(y, c0, 
c1, c2) * z), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (z * slice(y, c0, c1, c2)), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) * transpose(y, c0), transpose(x * y, c0)) || false) { return mutate(rewrite.result, info); diff --git a/src/Simplify_Sub.cpp b/src/Simplify_Sub.cpp index 29bd02c78ed6..2444cb6fd1d9 100644 --- a/src/Simplify_Sub.cpp +++ b/src/Simplify_Sub.cpp @@ -177,6 +177,7 @@ Expr Simplify::visit(const Sub *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) - (slice(y, c0, c1, c2) + z), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((slice(x, c0, c1, c2) - z) - slice(y, c0, c1, c2), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((z - slice(x, c0, c1, c2)) - slice(y, c0, c1, c2), z - slice(x + y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) - transpose(y, c0), transpose(x - y, c0)) || (no_overflow(op->type) && EVAL_IN_LAMBDA // (rewrite(max(x, y) - x, max(y - x, 0)) || diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index c159d9d62b0a..5711e36e92ec 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -161,13 +161,11 @@ class ReplaceStridedLoads : public IRMutator { std::map, Expr> replacements; std::map padding; Scope allocation_scope; - std::map>> let_injections; - - using IRMutator::mutate; + std::map>> let_injections; Stmt mutate(const Stmt &s) override { Stmt stmt = IRMutator::mutate(s); - auto it = let_injections.find(s); + auto it = let_injections.find(s.get()); if (it != let_injections.end()) { for (const auto &[name, value] : it->second) { stmt = LetStmt::make(name, value, stmt); @@ -176,6 +174,17 @@ class ReplaceStridedLoads : public IRMutator { return stmt; } + Expr mutate(const Expr &e) override { + Expr expr = IRMutator::mutate(e); + auto it = let_injections.find(e.get()); + if (it != 
let_injections.end()) { + for (const auto &[name, value] : it->second) { + expr = Let::make(name, value, expr); + } + } + return expr; + } + protected: Expr visit(const Load *op) override { const Allocate *alloc = nullptr; @@ -209,8 +218,8 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; -Stmt innermost_containing_stmt(const Stmt &root, const std::set &exprs) { - Stmt result; +const IRNode *innermost_containing_node(const Stmt &root, const std::set &exprs) { + const IRNode *result = nullptr; // The innermost containing stmt is whichever stmt node contains the // largest number of our exprs, with ties breaking inwards. int seen = 0, best = 0; @@ -219,28 +228,34 @@ Stmt innermost_containing_stmt(const Stmt &root, const std::set &e int old = seen; self->mutate_base(s); if (old == 0 && seen > best) { - result = s; + result = s.get(); best = seen; } return s; // }, [&](auto *self, const Expr &e) { + int old = seen; const Load *l = e.as(); if (l && exprs.count(l)) { seen++; }; - return self->mutate_base(e); // + self->mutate_base(e); + if (old == 0 && seen > best) { + result = e.get(); + best = seen; + } + return e; // }); internal_assert(seen) << "None of the exprs were found\n"; return result; } -bool can_hoist_shared_load(const Stmt &s, const std::string &buf, const Expr &idx) { +bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr &idx) { // Check none of the variables the idx depends on are defined somewhere // within this stmt, and there are no stores to the given buffer in the // stmt. bool result = true; - visit_with(s, // + visit_with(n, // [&](auto *self, const Let *let) { // result &= !expr_uses_var(idx, let->name); }, @@ -293,7 +308,8 @@ Stmt stage_strided_loads(const Stmt &s) { // We have a complete cluster of loads. 
Make a single dense load int lanes = k.lanes * k.stride; int64_t first_offset = load->first; - Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes); + Expr base = common_subexpression_elimination(k.base); + Expr idx = Ramp::make(base + (int)first_offset, make_one(k.base.type()), lanes); Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; @@ -304,14 +320,12 @@ Stmt stage_strided_loads(const Stmt &s) { Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, const_true(lanes), op->alignment); - shared_load = common_subexpression_elimination(shared_load); // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - Stmt let_site = innermost_containing_stmt(alloc ? Stmt(alloc) : s, all_loads); + const IRNode *let_site = innermost_containing_node(alloc ? Stmt(alloc) : s, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { - shared_load = Shuffle::make_transpose(shared_load, k.stride); std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { @@ -321,6 +335,7 @@ Stmt stage_strided_loads(const Stmt &s) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } } + shared_load = Shuffle::make_transpose(shared_load, k.stride); replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { From 794df0bf30950dbb32cb0b4f4c07a7023e655cdd Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 20 Feb 2026 09:54:19 -0800 Subject: [PATCH 16/34] Fix innermost_containing_node --- src/IRMutator.h | 9 +++++++++ src/StageStridedLoads.cpp | 9 +++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/IRMutator.h b/src/IRMutator.h index c170b37eb42b..4c06e12eda97 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ 
-343,6 +343,15 @@ auto mutate_with(const T &ir, Lambdas &&...lambdas) { } } +template +auto mutate_with(const IRNode *ir, Lambdas &&...lambdas) -> IRHandle { + if (ir->node_type <= StrongestExprNodeType) { + return mutate_with(Expr((const BaseExprNode *)ir), std::forward(lambdas)...); + } else { + return mutate_with(Stmt((const BaseStmtNode *)ir), std::forward(lambdas)...); + } +} + /** A helper function for mutator-like things to mutate regions */ template std::pair mutate_region(Mutator *mutator, const Region &bounds, Args &&...args) { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 5711e36e92ec..05799c433062 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -218,7 +218,7 @@ class ReplaceStridedLoads : public IRMutator { using IRMutator::visit; }; -const IRNode *innermost_containing_node(const Stmt &root, const std::set &exprs) { +const IRNode *innermost_containing_node(const IRNode *root, const std::set &exprs) { const IRNode *result = nullptr; // The innermost containing stmt is whichever stmt node contains the // largest number of our exprs, with ties breaking inwards. @@ -258,18 +258,23 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & visit_with(n, // [&](auto *self, const Let *let) { // result &= !expr_uses_var(idx, let->name); + self->visit_base(let); }, [&](auto *self, const LetStmt *let) { // result &= !expr_uses_var(idx, let->name); + self->visit_base(let); }, [&](auto *self, const For *loop) { // result &= !expr_uses_var(idx, loop->name); + self->visit_base(loop); }, [&](auto *self, const Allocate *alloc) { // result &= alloc->name != buf; + self->visit_base(alloc); }, [&](auto *self, const Store *store) { // result &= store->name != buf; + self->visit_base(store); }); return result; } @@ -324,7 +329,7 @@ Stmt stage_strided_loads(const Stmt &s) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. 
This is more efficient when extracting multiple // slices. - const IRNode *let_site = innermost_containing_node(alloc ? Stmt(alloc) : s, all_loads); + const IRNode *let_site = innermost_containing_node(k.scope, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); From 486addd0c8a6c934a891e1f5b9ec2c54c0b0027c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 21 Feb 2026 11:44:53 -0800 Subject: [PATCH 17/34] Fix some simd op check failures --- src/CodeGen_ARM.cpp | 1 + src/CodeGen_LLVM.cpp | 9 +++++++-- src/Lower.cpp | 2 +- src/StageStridedLoads.cpp | 17 +++++++++++++---- src/StageStridedLoads.h | 2 +- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 43372183aeb4..d43426857a9a 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1886,6 +1886,7 @@ void CodeGen_ARM::visit(const Shuffle *op) { if (target.os != Target::IOS && target.os != Target::OSX && load && op->vectors.size() == 1 && + op->is_slice() && 2 <= stride && stride <= 4 && op->slice_begin() < stride && load->type.lanes() == stride * op->type.lanes()) { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a5937f123cfe..85d3e2f4ce85 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2195,10 +2195,15 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { internal_assert(!t->isScalableTy()) << "optimization_fence does not support scalable vectors yet"; const int bits = t->getPrimitiveSizeInBits(); - if (bits % 16) { + if (bits % 32) { + const int lanes = get_vector_num_elements(t); + const int padded_lanes = (lanes + 3) / 4 * 4; + v = slice_vector(v, 0, padded_lanes); + v = optimization_fence(v); + v = slice_vector(v, 0, lanes); return v; } - llvm::Type *float_type = llvm_type_of(Float(16, bits / 16)); + llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); v = builder->CreateBitCast(v, float_type); v = 
builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); diff --git a/src/Lower.cpp b/src/Lower.cpp index 32b64e83a2bd..e633cc99c1d2 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -381,7 +381,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after partitioning loops:", s); debug(1) << "Staging strided loads...\n"; - s = stage_strided_loads(s); + s = stage_strided_loads(s, t); log("Lowering after staging strided loads:", s); debug(1) << "Trimming loops to the region over which they do something...\n"; diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 05799c433062..3df12a6d1592 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -281,7 +281,7 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & } // namespace -Stmt stage_strided_loads(const Stmt &s) { +Stmt stage_strided_loads(const Stmt &s, const Target &target) { FindStridedLoads finder; ReplaceStridedLoads replacer; @@ -329,18 +329,27 @@ Stmt stage_strided_loads(const Stmt &s) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. - const IRNode *let_site = innermost_containing_node(k.scope, all_loads); + const IRNode *let_site = innermost_containing_node(k.scope ? k.scope : s.get(), all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { + // For larger strides we can do a better job at shuffling if we + // do it as one big task. For stride 2 it interferes with + // horizontal add pattern matching. On ARM it also interferes + // with LLVM's pattern matching for vld3 and vld4. 
+ bool transpose_shared_load = k.stride > 2 && (target.arch != Target::ARM || k.stride > 4); std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; - Expr shuf = Shuffle::make_slice(var, row * k.lanes, 1, k.lanes); + Expr shuf = transpose_shared_load ? + Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : + Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } } - shared_load = Shuffle::make_transpose(shared_load, k.stride); + if (transpose_shared_load) { + shared_load = Shuffle::make_transpose(shared_load, k.stride); + } replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h index a29cef2438f1..b6afd3770981 100644 --- a/src/StageStridedLoads.h +++ b/src/StageStridedLoads.h @@ -37,7 +37,7 @@ namespace Internal { * internal allocations it adds padding to the allocation explicitly, by setting * the padding field on Allocate nodes. 
*/ -Stmt stage_strided_loads(const Stmt &s); +Stmt stage_strided_loads(const Stmt &s, const Target &target); } // namespace Internal } // namespace Halide From a1ecca90b95d67a04b16578de6d80aee9a56e6a0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 09:48:18 -0800 Subject: [PATCH 18/34] Fix infinite recursion issue and missed case in interleave codegen --- src/CodeGen_LLVM.cpp | 61 ++++++++++++++++++++++++++++++--------- src/StageStridedLoads.cpp | 11 +++++-- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 85d3e2f4ce85..3a759091a9bf 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2197,7 +2197,9 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { const int bits = t->getPrimitiveSizeInBits(); if (bits % 32) { const int lanes = get_vector_num_elements(t); - const int padded_lanes = (lanes + 3) / 4 * 4; + const int element_bits = t->getScalarSizeInBits(); + const int lanes_per_32_bits = 32 / element_bits; + const int padded_lanes = align_up(lanes, lanes_per_32_bits); v = slice_vector(v, 0, padded_lanes); v = optimization_fence(v); v = slice_vector(v, 0, lanes); @@ -2215,19 +2217,20 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(vecs[0]->getType() == vecs[i]->getType()); } int vec_elements = get_vector_num_elements(vecs[0]->getType()); + const int num_vecs = (int)vecs.size(); - int factor = gcd(vec_elements, (int)vecs.size()); + int factor = gcd(vec_elements, num_vecs); - if (vecs.size() == 1) { + if (num_vecs == 1) { return vecs[0]; - } else if (vecs.size() == 2) { + } else if (num_vecs == 2) { Value *a = vecs[0]; Value *b = vecs[1]; vector indices(vec_elements * 2); for (int i = 0; i < vec_elements * 2; i++) { indices[i] = i % 2 == 0 ? 
i / 2 : i / 2 + vec_elements; } - return optimization_fence(shuffle_vectors(a, b, indices)); + return shuffle_vectors(a, b, indices); } else if (factor == 1) { // The number of vectors and the vector length is // coprime. (E.g. interleaving an odd number of vectors of some @@ -2290,27 +2293,41 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { } return concat_vectors(v); - } else { // The number of vectors shares a factor with the length of the // vectors. Pick some factor of the number of vectors, interleave in // separate groups, and then interleave the results. Do the largest // power of two factor first. - const int n = (int)vecs.size(); - int f = n & -n; - if (f == 1 || f == n) { - for (int i = 2; i < n; i++) { - if (n % i == 0) { + int f = num_vecs & -num_vecs; + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { f = i; break; } } } - internal_assert(f > 1 && f < n && n % f == 0) << f << " " << n; + // if f == 1 then the vector length is a multiple of the + // interleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad each vector out to a + // power of two size, interleave, and discard the tail of the + // result. This buys us some extra room to run Catanzaro's algorithm in. 
+ if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + std::vector padded(num_vecs); + for (int i = 0; i < num_vecs; i++) { + padded[i] = slice_vector(vecs[i], 0, padded_size); + } + Value *v = interleave_vectors(padded); + return slice_vector(v, 0, num_vecs * vec_elements); + } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; vector> groups(f); - for (size_t i = 0; i < vecs.size(); i++) { + for (int i = 0; i < num_vecs; i++) { groups[i % f].push_back(vecs[i]); } @@ -2428,7 +2445,23 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) } } - internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) << f << " " << num_vecs; + // if f == 1 then the final vector length is a multiple of the + // deinterleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad the vector out to a + // power of two size, deinterleave, and discard the tail of each vector + // result. This buys us some extra room to run Catanzaro's algorithm in. + if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + Value *padded = slice_vector(vec, 0, padded_size * num_vecs); + std::vector result = deinterleave_vector(padded, num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(result[i], 0, vec_elements); + } + return result; + } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; auto partial = deinterleave_vector(vec, f); std::vector result(num_vecs); diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 3df12a6d1592..541e27be6a2a 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -329,7 +329,12 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // If possible, we do the shuffle as an in-place transpose followed // by a dense slice. This is more efficient when extracting multiple // slices. 
- const IRNode *let_site = innermost_containing_node(k.scope ? k.scope : s.get(), all_loads); + + // We can't lift the shared load further out than the scope over + // which the loads' definitions occur. If k.scope is null, the loads + // are valid everywhere (it must be an input buffer). + const IRNode *outermost = k.scope ? k.scope : s.get(); + const IRNode *let_site = innermost_containing_node(outermost, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { // For larger strides we can do a better job at shuffling if we // do it as one big task. For stride 2 it interferes with @@ -341,8 +346,8 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; Expr shuf = transpose_shared_load ? - Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : - Shuffle::make_slice(var, row, k.stride, k.lanes); + Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : + Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { replacer.replacements.emplace(std::make_pair(alloc, l), shuf); } From f66d5eaa68c815cd098ad2f6bf7a017d2fdac30a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 09:53:29 -0800 Subject: [PATCH 19/34] Adjust expectations in stage_strided_loads test --- test/correctness/stage_strided_loads.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index f791385f7c25..038108844eb4 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -86,10 +86,7 @@ int main(int argc, char **argv) { f(x) += {buf(2 * x), buf(2 * x + 1)}; f.update().vectorize(x, 8, TailStrategy::RoundUp); - // In this case, the dense load appears twice across the two store - // statements for the two tuple components, but it will get deduped by - // llvm. 
- checker.check(f, 2); + checker.check(f, 1); } { @@ -113,7 +110,7 @@ int main(int argc, char **argv) { g.vectorize(x, 8, TailStrategy::RoundUp); f.compute_at(g, x).vectorize(x); - checker.check(g, 2); + checker.check(g, 1); } { @@ -125,7 +122,7 @@ int main(int argc, char **argv) { g(x) = f(x); g.vectorize(x, 8, TailStrategy::RoundUp); - checker.check(g, 2); + checker.check(g, 1); } { @@ -135,7 +132,7 @@ int main(int argc, char **argv) { f(x, c) = buf(4 * x + c) + 4 * x; f.vectorize(x, 8, TailStrategy::RoundUp).bound(c, 0, 4).unroll(c).reorder(c, x); - checker.check(f, 4); + checker.check(f, 1); } { @@ -152,7 +149,7 @@ int main(int argc, char **argv) { f.tile(x, y, xi, yi, 8, 8, TailStrategy::RoundUp).vectorize(xi).reorder(c, x, y); g.compute_at(f, x).vectorize(x); h.compute_at(f, x).vectorize(x); - checker.check(f, 2); + checker.check(f, 1); } // We can always densify strided loads to internal allocations, because we @@ -181,7 +178,7 @@ int main(int argc, char **argv) { { Func f; Var x; - f(x) = buf(16 * x) + buf(16 * x + 15); + f(x) = buf(17 * x) + buf(17 * x + 15); f.vectorize(x, 16, TailStrategy::RoundUp); checker.check_not(f, 0); From c25142f7a27b83fb80d6485b7612b5335cd92b23 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 10:25:44 -0800 Subject: [PATCH 20/34] Allow reversed suffix or not in sve test --- test/correctness/simd_op_check_sve2.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index fca748dd60d9..7e1e1e00ddfb 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -447,13 +447,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { Expr shift = (i_2 % bits) - (bits / 2); Expr round_s = (cast_i(1) >> min(shift, 0)) / 2; Expr round_u = (cast_u(1) >> min(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift)); - 
add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift)); + // The r suffix is optional - it just changes which of the two args gets clobbered + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) << shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) << shift)); round_s = (cast_i(1) << max(shift, 0)) / 2; round_u = (cast_u(1) << max(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift)); - add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift)); + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) >> shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) >> shift)); // VRSHR I - Rounding Shift Right add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1)); From bae3e02d2bcb7499fa52c5fa950189acfc6ed99b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 10:26:00 -0800 Subject: [PATCH 21/34] Don't use optimization fences on hexagon --- src/CodeGen_Hexagon.cpp | 7 +++++++ src/CodeGen_LLVM.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 4ac47b4d1f3b..563ac00a4972 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_Posix { llvm::Value *interleave_vectors(const std::vector &v) override; llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b, const std::vector &indices) override; + llvm::Value *optimization_fence(llvm::Value *v) override; using CodeGen_Posix::shuffle_vectors; ///@} @@ -1296,6 +1297,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, return vdelta(concat_vectors({a, b}), indices); } +Value *CodeGen_Hexagon::optimization_fence(Value *v) { + // As of llvm 21, the base class version seems to trip up LLVM's hexagon + // 
backend, possibly because it relies on a floating point type. + return v; +} + Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, int max_index) { llvm::Type *lut_ty = lut->getType(); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 46ec05638e3f..abbf935122c3 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -465,7 +465,7 @@ class CodeGen_LLVM : public IRVisitor { /** A fence to prevent fusion of ops by llvm. Designed for floats, but we * abuse it to prevent shufflevector fusion too. */ - llvm::Value *optimization_fence(llvm::Value *); + virtual llvm::Value *optimization_fence(llvm::Value *); /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments From b7defbd2934330d5eecfd4fa8cc9567f90ad03ec Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 11:18:18 -0800 Subject: [PATCH 22/34] Fix infinite simplifier loop --- src/Simplify_Exprs.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 52665c0c2894..ad8a9827847a 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -362,7 +362,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { } return Shuffle::make(loaded_vecs, s_index->indices); } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; - inner_ramp && is_const_one(r_index->stride)) { + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { // If it's a nested ramp and the outer ramp has stride 1, swap the // nesting order of the ramps to make dense loads and transpose the // resulting vector instead. 
From 23944a0093f5e9d0d03f96b12ea2706d8fba1c6e Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 11:18:27 -0800 Subject: [PATCH 23/34] Don't hoist transposes on hexagon --- src/StageStridedLoads.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 541e27be6a2a..d8315ecd19cd 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -340,7 +340,10 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // do it as one big task. For stride 2 it interferes with // horizontal add pattern matching. On ARM it also interferes // with LLVM's pattern matching for vld3 and vld4. - bool transpose_shared_load = k.stride > 2 && (target.arch != Target::ARM || k.stride > 4); + bool transpose_shared_load = k.stride > 2; + if (target.arch == Target::ARM || target.arch == Target::Hexagon) { + transpose_shared_load = k.stride > 4; + } std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { From 0d110d206dbfcb133fda12d8a7d44c72d770eca3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 23 Feb 2026 14:07:10 -0800 Subject: [PATCH 24/34] Make distinct strided load nodes in the IR distinct in memory too --- src/StageStridedLoads.cpp | 46 +++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index d8315ecd19cd..3723f997a871 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -158,9 +158,8 @@ class FindStridedLoads : public IRVisitor { // Replace a bunch of load expressions in a stmt class ReplaceStridedLoads : public IRMutator { public: - std::map, Expr> replacements; + std::map replacements; std::map padding; - Scope allocation_scope; std::map>> let_injections; Stmt mutate(const Stmt &s) override { @@ -187,11 +186,7 @@ class 
ReplaceStridedLoads : public IRMutator { protected: Expr visit(const Load *op) override { - const Allocate *alloc = nullptr; - if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { - alloc = *a_ptr; - } - auto it = replacements.find({alloc, op}); + auto it = replacements.find(op); if (it != replacements.end()) { return mutate(it->second); } else { @@ -200,7 +195,6 @@ class ReplaceStridedLoads : public IRMutator { } Stmt visit(const Allocate *op) override { - ScopedBinding bind(allocation_scope, op->name, op); auto it = padding.find(op); Stmt s = IRMutator::visit(op); if (it == padding.end()) { @@ -281,10 +275,25 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & } // namespace -Stmt stage_strided_loads(const Stmt &s, const Target &target) { +Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { FindStridedLoads finder; ReplaceStridedLoads replacer; + // Make all strided loads distinct IR nodes so that we can uniquely identify + // them by address. We may want to mutate the same load node in different + // ways depending on the surrounding context. + Stmt s = mutate_with(stmt, [&](auto *self, const Load *l) { + const Ramp *r = l->index.as(); + if (l->type.is_scalar() || (r && is_const_one(r->stride))) { + // Definitely not a strided load + return self->visit_base(l); + } else { + // Might be a strided load after simplification + return Load::make(l->type, l->name, self->mutate(l->index), l->image, l->param, + self->mutate(l->predicate), l->alignment); + } + }); + // Find related clusters of strided loads anywhere in the stmt. 
While this // appears to look globally, it requires expressions to match exactly, so // really it's only going to find things inside the same loops and let @@ -293,7 +302,6 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { for (const auto &l : finder.found_loads) { const FindStridedLoads::Key &k = l.first; - const Allocate *alloc = k.allocation; const std::map> &v = l.second; // Find clusters of strided loads that can share the same dense load. @@ -352,7 +360,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } if (transpose_shared_load) { @@ -364,7 +372,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { int row = load->first - first_offset; Expr shuf = Shuffle::make_slice(shared_load, row, k.stride, k.lanes); for (const Load *l : load->second) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } } @@ -374,7 +382,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // picked up in a cluster, but for whom we know it's safe to do a // dense load before their start. 
for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } int64_t delta = k.stride - 1; @@ -392,14 +400,14 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, delta, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } // Look for any loads we can densify because an overlapping load occurs // in any parent scope. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } int64_t min_offset = offset; @@ -430,7 +438,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, offset - final_offset, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } @@ -439,7 +447,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { // external allocations by doing a dense load at a trimmed size. We rely // on codegen to do a good job at loading vectors of a funny size. 
for (const auto &[offset, loads] : v) { - if (replacer.replacements.count({alloc, loads[0]})) { + if (replacer.replacements.count(loads[0])) { continue; } @@ -463,7 +471,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { dense_load = common_subexpression_elimination(dense_load); Expr shuf = Shuffle::make_slice(dense_load, offset - first_offset, k.stride, k.lanes); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } else if (k.lanes % 2 == 0) { @@ -486,7 +494,7 @@ Stmt stage_strided_loads(const Stmt &s, const Target &target) { Expr shuf2 = Shuffle::make_slice(dense_load2, delta, k.stride, k.lanes / 2); Expr shuf = Shuffle::make_concat({shuf1, shuf2}); for (const Load *l : loads) { - replacer.replacements.emplace(std::make_pair(alloc, l), shuf); + replacer.replacements.emplace(l, shuf); } } } From 84f10b1ce4e26490d77fb60d8626e4b4a34734bb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 12:35:42 -0800 Subject: [PATCH 25/34] arm-32 has no vst2 for 64-bit elements --- src/CodeGen_ARM.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index d43426857a9a..31f64272b552 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1471,10 +1471,11 @@ void CodeGen_ARM::visit(const Store *op) { intrin_type = t; Type elt = t.element_of(); int vec_bits = t.bits() * t.lanes(); - if (elt == Float(32) || elt == Float(64) || - is_float16_and_has_feature(elt) || - elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || - elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { + if (t.bits() <= target.bits && + (elt == Float(32) || elt == Float(64) || + is_float16_and_has_feature(elt) || + elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || + elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) { const int 
target_vector_bits = native_vector_bits(); if (vec_bits % 128 == 0) { type_ok_for_vst = true; From 8d93c3c7c2fc76386af57df8ddf1136c25b9cbc8 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 13:43:51 -0800 Subject: [PATCH 26/34] Windows bad filename fix in simd op check --- test/correctness/simd_op_check_sve2.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index 7e1e1e00ddfb..3ed0a70ef380 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -1221,6 +1221,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { std::stringstream type_name_stream; type_name_stream << e.type(); std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor); + + // Some regex symbols are illegal in filenames on windows + std::string illegal = "<>:\"/\\|?*"; + std::replace_if(decorated_op_name.begin(), decorated_op_name.end(), // + [&](char c) { return illegal.find(c) != std::string::npos; }, '_'); + auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size()); // Bail out after generating the unique_name, so that names are From 36565ce56464486ad2cbdd6fab7a84131bb430d4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 14:04:19 -0800 Subject: [PATCH 27/34] Temporary dumping of cpu info to debug github actions issue --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 54c61a622ae8..0f136b40114d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ MAKEFLAGS += --no-builtin-rules UNAME = $(shell uname) +$(info $(shell cat /proc/cpuinfo)) + ifeq ($(OS), Windows_NT) $(error Halide no longer supports the MinGW environment. Please use MSVC through CMake instead.) 
else From 3f45c47773f0c3cf4ec0dc2582a0b9fe8b0c4f41 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 24 Feb 2026 14:24:17 -0800 Subject: [PATCH 28/34] dump cpuinfo in makefile testing workflow To help diagnose occasional illegal instruction errors --- .github/workflows/testing-make.yml | 1 + Makefile | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/testing-make.yml b/.github/workflows/testing-make.yml index ccd6c600f851..0beeb9a86563 100644 --- a/.github/workflows/testing-make.yml +++ b/.github/workflows/testing-make.yml @@ -47,6 +47,7 @@ jobs: run: | if [ "$RUNNER_OS" = "Linux" ]; then echo "LLVM_CONFIG=llvm-config-$LLVM_VERSION" | tee -a "$GITHUB_ENV" + cat /proc/cpuinfo elif [ "$RUNNER_OS" = "macOS" ]; then echo "LLVM_CONFIG=$(brew --prefix llvm@$LLVM_VERSION)/bin/llvm-config" | tee -a "$GITHUB_ENV" fi diff --git a/Makefile b/Makefile index 0f136b40114d..54c61a622ae8 100644 --- a/Makefile +++ b/Makefile @@ -16,8 +16,6 @@ MAKEFLAGS += --no-builtin-rules UNAME = $(shell uname) -$(info $(shell cat /proc/cpuinfo)) - ifeq ($(OS), Windows_NT) $(error Halide no longer supports the MinGW environment. Please use MSVC through CMake instead.) 
else From 2695151683836d4597d50a5a9f0aa4decb4fbb7a Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 6 Mar 2026 14:25:58 -0800 Subject: [PATCH 29/34] Address review comments --- src/CodeGen_LLVM.cpp | 12 +- src/IR.cpp | 6 +- src/IR.h | 12 +- src/Simplify_Exprs.cpp | 16 +- src/Simplify_Shuffle.cpp | 5 +- src/Simplify_Stmts.cpp | 29 +++- src/StageStridedLoads.cpp | 22 ++- src/Util.h | 5 + test/correctness/CMakeLists.txt | 1 + test/correctness/stage_strided_loads.cpp | 2 +- test/correctness/transpose_idioms.cpp | 211 +++++++++++++++++++++++ test/performance/interleave.cpp | 6 +- 12 files changed, 303 insertions(+), 24 deletions(-) create mode 100644 test/correctness/transpose_idioms.cpp diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 259f287354b1..4be2a8ab0577 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2241,7 +2241,6 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { // Using unary shuffles, get each element into the right ultimate // lane. This works out without collisions because the number of vectors // and the length of each vector is coprime. - const int num_vecs = (int)v.size(); std::vector shuffle(vec_elements); for (int i = 0; i < num_vecs; i++) { for (int j = 0; j < vec_elements; j++) { @@ -2298,7 +2297,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { // vectors. Pick some factor of the number of vectors, interleave in // separate groups, and then interleave the results. Do the largest // power of two factor first. 
- int f = num_vecs & -num_vecs; + int f = largest_power_of_two_factor(num_vecs); if (f == 1 || f == num_vecs) { for (int i = 2; i < num_vecs; i++) { if (num_vecs % i == 0) { @@ -2317,6 +2316,7 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { int padded_size = next_power_of_two(vec_elements); std::vector padded(num_vecs); for (int i = 0; i < num_vecs; i++) { + // slice_vector can also be used to pad with don't cares padded[i] = slice_vector(vecs[i], 0, padded_size); } Value *v = interleave_vectors(padded); @@ -2367,7 +2367,6 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // Use the inverse of Catanzaro's algorithm from above. We slice into // distinct vectors, then rotate each element into the correct final // vector, then do a unary permutation of each vector. - std::vector shuffle(vec_elements); // Instead of concatenating, we slice. std::vector v(num_vecs); @@ -2385,6 +2384,7 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // We'll handle each bit of the rotation one at a time with a two-way // shuffle. + std::vector shuffle(vec_elements); std::vector new_v(v.size()); int d = 1; while (d < num_vecs) { @@ -2409,7 +2409,9 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // Now reorder the vectors in the inverse order to the above. for (int i = 0; i < num_vecs; i++) { int j = (i * vec_elements) % num_vecs; - // j and i are swapped below, because we're doing the inverse of the algorithm above + // j and i are swapped below, because we're doing the inverse of the + // algorithm above. This map is 1:1 because vec_elements and + // num_vecs are coprime, so every slot of new_v is stored to. new_v[j] = v[i]; } v.swap(new_v); @@ -2435,7 +2437,7 @@ std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) // again. We know there's a non-trivial factor because if it were prime // the gcd above would have been 1. Do the largest power-of-two factor // first. 
- int f = num_vecs & -num_vecs; + int f = largest_power_of_two_factor(num_vecs); if (f == 1 || f == num_vecs) { for (int i = 2; i < num_vecs; i++) { if (num_vecs % i == 0) { diff --git a/src/IR.cpp b/src/IR.cpp index 5f5320c68c87..b53d99960d4b 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -1029,10 +1029,12 @@ bool Shuffle::is_concat() const { bool Shuffle::is_transpose() const { if (vectors.size() > 1 || (int)indices.size() != vectors[0].type().lanes() || - indices.size() < 2) { + indices.size() < 2 || + indices[0] != 0 || + indices[1] <= 0) { return false; } - int cols = indices[1] - indices[0]; + int cols = indices[1]; int rows = vectors[0].type().lanes() / cols; if ((int)indices.size() != rows * cols) { return false; diff --git a/src/IR.h b/src/IR.h index e70312363627..3b1320330df6 100644 --- a/src/IR.h +++ b/src/IR.h @@ -988,8 +988,11 @@ struct Shuffle : public ExprNode { * interleaving of vectors of the same length. */ static Expr make_interleave(const std::vector &vectors); - /** Convenience constructor for making a shuffle representing an - * in-place transpose of a matrix with the given number of columns. */ + /** Convenience constructor for making a shuffle representing an in-place + * transpose of a row-major matrix with the given number of columns. The + * output, interpreted as a row-major matrix, therefore has that number of + * rows. For example, to turn the vector RGBRGBRGBRGB into RRRRGGGGBBBB cols + * would be 3, and to do the reverse cols would be 4. */ static Expr make_transpose(Expr e, int cols); /** Convenience constructor for making a shuffle representing a @@ -1012,7 +1015,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; - /** Check if this shuffle is an in-place transpose of a single vector */ + /** Check if this shuffle is an in-place transpose of a single vector. 
The + * factor is the number of columns of the source matrix, or equivalently, + * the number of rows of the destination matrix, interpreting a vector as a + * matrix stored row-major. */ bool is_transpose() const; int transpose_factor() const; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index ad8a9827847a..c7eb6c7f802b 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -349,13 +349,20 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { op->image, op->param, const_true(new_lanes, nullptr), align); return Broadcast::make(load, b_index->lanes); } else if (s_index && - is_const_one(predicate) && (s_index->is_concat() || s_index->is_interleave())) { - // Loads of concats/interleaves should be concats/interleaves of loads + // Loads of concats/interleaves should be concats/interleaves of + // loads. We'll need to slice up the predicate though. std::vector loaded_vecs; for (const Expr &new_index : s_index->vectors) { int new_lanes = new_index.type().lanes(); + Expr predicate_slice = + is_const_one(predicate) ? const_true(new_lanes, nullptr) : + s_index->is_concat() ? + Shuffle::make_slice(predicate, (int)loaded_vecs.size() * new_lanes, 1, new_lanes) : + Shuffle::make_slice(predicate, (int)loaded_vecs.size(), op->type.lanes() / new_lanes, new_lanes); + predicate_slice = mutate(predicate_slice, nullptr); + Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index, op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{}); loaded_vecs.emplace_back(std::move(load)); @@ -371,8 +378,11 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { Expr transposed_index = Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? 
+ predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); Expr transposed_load = - Load::make(op->type, op->name, transposed_index, op->image, op->param, predicate, align); + Load::make(op->type, op->name, transposed_index, op->image, op->param, transposed_predicate, align); return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) { return op; diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..2a614ac81744 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -95,10 +95,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { // broadcast. Note that it doesn't matter what the indices // are. const Broadcast *b1 = new_vectors[0].as(); - if (b1) { + if (b1 && b1->value.type().is_scalar()) { bool can_collapse = true; for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) { - if (const Broadcast *b2 = new_vectors[i].as()) { + if (const Broadcast *b2 = new_vectors[i].as(); + b2 && b2->value.type().is_scalar()) { Expr check = mutate(b1->value - b2->value, nullptr); can_collapse &= is_const_zero(check); } else { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 1b4588342096..60e80e86c1b5 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -326,6 +326,7 @@ Stmt Simplify::visit(const Store *op) { ExprInfo index_info; Expr index = mutate(op->index, &index_info); + // If the store is fully unconditional and out of bounds, drop it. // This should only occur inside branches that make the store unreachable, // but perhaps the branch was hard to prove constant true or false. 
This @@ -342,8 +343,9 @@ Stmt Simplify::visit(const Store *op) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); @@ -367,7 +369,10 @@ Stmt Simplify::visit(const Store *op) { // foo[x] = foo[x] or foo[x] = undef is a no-op return Evaluate::make(0); } else if (shuf && shuf->is_concat()) { - // Break a store of a concat of vector indices into separate stores + // Break a store of a concat of vector indices into separate stores. A + // concat index will result in a general scatter at codegen time. We + // should just break it up here, where there is a hope that the + // individual elements might be simplifiable to dense ramps. std::string var_name = unique_name('t'); Expr var = Variable::make(value.type(), var_name); std::vector stores; @@ -384,6 +389,24 @@ Stmt Simplify::visit(const Store *op) { Stmt s = Block::make(stores); s = LetStmt::make(var_name, value, s); return mutate(s); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense stores and transpose the + // index and value instead. Later in lowering after flattening the + // nested ramps it will turn into a concat of dense ramps and hit the + // case above. + Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_value = Shuffle::make_transpose(value, inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? 
+ predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); + return mutate(Store::make(op->name, transposed_value, transposed_index, + op->param, transposed_predicate, align)); } else if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index a1319d722112..896a33b5193e 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -104,7 +104,7 @@ class FindStridedLoads : public IRVisitor { // TODO: We do not yet handle nested vectorization here for // ramps which have not already collapsed. We could potentially // handle more interesting types of shuffle than simple flat slices. - if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) { + if (stride >= 2 && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { @@ -334,9 +334,23 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; + int last_offset = first_offset; + int64_t biggest_gap = 0; std::set all_loads; for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) { all_loads.insert(l->second.begin(), l->second.end()); + biggest_gap = std::max(biggest_gap, l->first - last_offset); + last_offset = l->first; + } + biggest_gap = std::max(biggest_gap, (first_offset + k.stride) - last_offset); + + // If our contiguous shared load has contiguous vectors in it of + // size at least k.lanes that are going to be entirely unused, this + // is a bad idea (e.g. a cluster of {ramp(0, 1024, 8) and ramp(37, + // 1024, 8)} should not be staged). 
+ if (biggest_gap >= k.lanes) { + load++; + continue; } Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, @@ -391,7 +405,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // picked up in a cluster, but for whom we know it's safe to do a // dense load before their start. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t delta = k.stride - 1; @@ -416,7 +430,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // Look for any loads we can densify because an overlapping load occurs // in any parent scope. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t min_offset = offset; @@ -456,7 +470,7 @@ Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { // external allocations by doing a dense load at a trimmed size. We rely // on codegen to do a good job at loading vectors of a funny size. for (const auto &[offset, loads] : v) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } diff --git a/src/Util.h b/src/Util.h index a437d18a9ce4..2a9b9a676cc0 100644 --- a/src/Util.h +++ b/src/Util.h @@ -575,6 +575,11 @@ inline int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } +/** Returns the largest power of two which is a factor of the argument. */ +inline int64_t largest_power_of_two_factor(int64_t x) { + return x & -x; +} + /** Return whether or not an integer is a power of two. 
*/ inline bool is_power_of_two(int64_t x) { return (x & (x - 1)) == 0; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 77066a8392bd..5805fe599827 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -327,6 +327,7 @@ tests(GROUPS correctness tracing_broadcast.cpp tracing_stack.cpp transitive_bounds.cpp + transpose_idioms.cpp trim_no_ops.cpp tuple_partial_update.cpp tuple_reduction.cpp diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index 757f71acd487..dc09be89d09c 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -190,7 +190,7 @@ int main(int argc, char **argv) { { Func f; Var x; - f(x) = buf(17 * x) + buf(17 * x + 15); + f(x) = buf(50 * x) + buf(50 * x + 15); f.vectorize(x, 16, TailStrategy::RoundUp); checker.check_not(f, 0); diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp new file mode 100644 index 000000000000..afe02039fbde --- /dev/null +++ b/test/correctness/transpose_idioms.cpp @@ -0,0 +1,211 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +// This test enumerates all the scheduling idioms in Halide that *should* +// produce good code for a transpose/interleave/deinterleave operation. 
+ +class Checker : public IRMutator { + + using IRMutator::visit; + + Expr visit(const Load *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_loads++; + } else if (op->type.is_vector()) { + gathers++; + } + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_stores++; + } else if (op->index.type().is_vector()) { + scatters++; + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + transposes += op->is_transpose(); + interleaves += op->is_interleave(); + if (op->is_slice()) { + if (op->slice_stride() == 1) { + dense_slices++; + } else { + strided_slices++; + } + } + return IRMutator::visit(op); + } + +public: + int dense_loads = 0; + int gathers = 0; + int dense_stores = 0; + int scatters = 0; + int dense_slices = 0; + int strided_slices = 0; + int interleaves = 0; + int transposes = 0; + + void check() { + internal_assert(gathers == 0) << "Vector gathers found"; + internal_assert(scatters == 0) << "Vector scatters found"; + internal_assert(strided_slices == 0) << "strided slices found"; + internal_assert(dense_loads) << "No dense loads found"; + internal_assert(dense_stores) << "No dense stores found"; + internal_assert(interleaves + transposes) << "No interleaves or transposes found"; + } +}; + +void check(Func g) { + Checker checker; + g.add_custom_lowering_pass(&checker, nullptr); + + // Choose a shape with lots of factors so that our RoundUp schedules work + int n = 16 * 9 * 7; + Buffer out = g.realize({n, n}); + for (int y = 0; y < out.height(); y++) { + for (int x = 0; x < out.width(); x++) { + int correct = 100 * x + y; + internal_assert(out(x, y) == correct) + << "out(" << x << ", " << y << ") = " << out(x, y) + << " instead of " << correct << "\n"; + } + } + + checker.check(); +} + +int main(int argc, char **argv) { + Var x{"x"}, y{"y"}, xi{"xi"}, yi{"yi"}; + + // In each case 
we'll say g(x, y) = f(y, x) and tile it. We will try power + // of two sizes, and sizes that are coprime, and sizes that are neither + // coprime no powers of two. + + for (auto tile : {std::pair{8, 16}, {7, 3}, {6, 9}}) { + { + // Idiom 1: Strided stores into a staged transposed copy of the + // input. The strided stores that get mashed together into one big + // interleave + store by the pass that interleaves strided + // stores. This has to be done on a staged copy of the input rather + // than g so that the strided stores have a constant stride. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).reorder_storage(y, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 2: Vectorize x, unroll y. Stage a copy of the input but + // don't transpose it. This will create strided loads from the + // staged input that get hoisted out into one big dense load + + // transpose by the stage_strided_stores pass. The staging is + // required so that the strides are constant. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 3: Vectorize both, x innermost. This should be handled by + // shuffle optimization logic in the simplifier: a store of a concat + // of ramps turns into a sequence of stores of slices of the RHS, + // and a load of a ramp of a ramp where the *outer* ramp has stride + // 1 but the inner doesn't turns into a transpose of a concat of + // dense loads. 
+ Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + + { + // Idiom 4: Vectorize both, y innermost. In this case the store of a + // ramp of a ramp gets rewritten by the simplifier to move the ramp + // with stride one innermost, transposing the RHS. + + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .reorder(yi, xi) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + { + // Check the double-vectorization approaches also work when there is a + // vector predicate on one of the two vectors, to be sure the simplifier + // is transforming the predicate correctly. We can't predicate both, + // because the vectorizer can't handle it and generates a scalar tail. + { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::Predicate) + .split(y, y, yi, 11, TailStrategy::ShiftInwards) + .reorder(xi, yi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::ShiftInwards) + .split(y, y, yi, 11, TailStrategy::Predicate) + .reorder(yi, xi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + printf("Success!\n"); +} diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp index 3df42ed0237f..ee1598e40d41 100644 --- a/test/performance/interleave.cpp +++ b/test/performance/interleave.cpp @@ -29,7 +29,11 @@ Result test_interleave(int factor, const Target &t) { output(x) = in(x / factor, x % factor); Var xi, yi; - output.unroll(x, factor, TailStrategy::RoundUp).vectorize(x, t.natural_vector_size(), 
TailStrategy::RoundUp); + // We'll use the interleaving-stores scheduling idiom, where unrolling + // strided stores gets mashed together into a single dense store of a + // interleave_vectors call. + output.unroll(x, factor, TailStrategy::RoundUp) + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); output.output_buffer().dim(0).set_min(0); output.compile_jit(); From 2962ea191044b2a06a2040de52b002c62cbb966f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 10 Mar 2026 13:55:02 -0700 Subject: [PATCH 30/34] Remove duplicate function body --- src/CodeGen_LLVM.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7edc6a3e003e..a23e0f52ab0d 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5211,27 +5211,6 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { } } -Value *CodeGen_LLVM::optimization_fence(Value *v) { - llvm::Type *t = v->getType(); - internal_assert(!t->isScalableTy()) - << "optimization_fence does not support scalable vectors yet"; - const int bits = t->getPrimitiveSizeInBits(); - if (bits % 32) { - const int lanes = get_vector_num_elements(t); - const int element_bits = t->getScalarSizeInBits(); - const int lanes_per_32_bits = 32 / element_bits; - const int padded_lanes = align_up(lanes, lanes_per_32_bits); - v = slice_vector(v, 0, padded_lanes); - v = optimization_fence(v); - v = slice_vector(v, 0, lanes); - return v; - } - llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); - v = builder->CreateBitCast(v, float_type); - v = builder->CreateArithmeticFence(v, float_type); - return builder->CreateBitCast(v, t); -} - Value *CodeGen_LLVM::concat_vectors(const vector &v) { if (v.size() == 1) { return v[0]; From fa2fcb7aad2b465f64003d725588456db465befd Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 11:14:04 -0700 Subject: [PATCH 31/34] Use slice of predicate --- src/Simplify_Exprs.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index c7eb6c7f802b..29f827553789 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -364,7 +364,7 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { predicate_slice = mutate(predicate_slice, nullptr); Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index, - op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{}); + op->image, op->param, predicate_slice, ModulusRemainder{}); loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); From dcdfb903637eaa4dbbb0bfa63ff75b1679629722 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 11:18:11 -0700 Subject: [PATCH 32/34] clang-format --- src/Simplify_Stmts.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 60e80e86c1b5..b1940482802a 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -326,7 +326,6 @@ Stmt Simplify::visit(const Store *op) { ExprInfo index_info; Expr index = mutate(op->index, &index_info); - // If the store is fully unconditional and out of bounds, drop it. // This should only occur inside branches that make the store unreachable, // but perhaps the branch was hard to prove constant true or false. 
This From 70afc5881deaabba73919503531f41fed0183c0b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 11 Mar 2026 13:09:11 -0700 Subject: [PATCH 33/34] SVE fixes Co-authored-by: Claude Code --- src/CodeGen_LLVM.cpp | 10 +++++++--- test/correctness/transpose_idioms.cpp | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a23e0f52ab0d..ca66274d5ea4 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2199,8 +2199,12 @@ void CodeGen_LLVM::visit(const Broadcast *op) { Value *CodeGen_LLVM::optimization_fence(Value *v) { llvm::Type *t = v->getType(); - internal_assert(!t->isScalableTy()) - << "optimization_fence does not support scalable vectors yet"; + if (t->isScalableTy()) { + // Convert to fixed, fence, convert back. + Value *fixed = scalable_to_fixed_vector_type(v); + fixed = optimization_fence(fixed); + return fixed_to_scalable_vector_type(fixed); + } const int bits = t->getPrimitiveSizeInBits(); if (bits % 32) { const int lanes = get_vector_num_elements(t); @@ -2212,7 +2216,7 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { v = slice_vector(v, 0, lanes); return v; } - llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); + llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); v = builder->CreateBitCast(v, float_type); v = builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp index afe02039fbde..9fb29c2883e0 100644 --- a/test/correctness/transpose_idioms.cpp +++ b/test/correctness/transpose_idioms.cpp @@ -87,9 +87,10 @@ int main(int argc, char **argv) { // In each case we'll say g(x, y) = f(y, x) and tile it. We will try power // of two sizes, and sizes that are coprime, and sizes that are neither - // coprime no powers of two. + // coprime no powers of two. 
We'll use sizes larger than 4, because some + // backends like to do different things for small strides. - for (auto tile : {std::pair{8, 16}, {7, 3}, {6, 9}}) { + for (auto tile : {std::pair{8, 16}, {7, 5}, {6, 9}}) { { // Idiom 1: Strided stores into a staged transposed copy of the // input. The strided stores that get mashed together into one big From 5d2b5241fa620d9d84d70c8df960315a0dcf7577 Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Mon, 16 Mar 2026 01:01:39 -0400 Subject: [PATCH 34/34] Move optimization_fence back --- src/CodeGen_LLVM.cpp | 50 ++++++++++++++++++++++---------------------- src/CodeGen_LLVM.h | 9 ++++---- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index bc4dd4f8eb3e..2d74f12b4c67 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -2197,31 +2197,6 @@ void CodeGen_LLVM::visit(const Broadcast *op) { value = create_broadcast(v, op->lanes); } -Value *CodeGen_LLVM::optimization_fence(Value *v) { - llvm::Type *t = v->getType(); - if (t->isScalableTy()) { - // Convert to fixed, fence, convert back. 
- Value *fixed = scalable_to_fixed_vector_type(v); - fixed = optimization_fence(fixed); - return fixed_to_scalable_vector_type(fixed); - } - const int bits = t->getPrimitiveSizeInBits(); - if (bits % 32) { - const int lanes = get_vector_num_elements(t); - const int element_bits = t->getScalarSizeInBits(); - const int lanes_per_32_bits = 32 / element_bits; - const int padded_lanes = align_up(lanes, lanes_per_32_bits); - v = slice_vector(v, 0, padded_lanes); - v = optimization_fence(v); - v = slice_vector(v, 0, lanes); - return v; - } - llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); - v = builder->CreateBitCast(v, float_type); - v = builder->CreateArithmeticFence(v, float_type); - return builder->CreateBitCast(v, t); -} - Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(!vecs.empty()); for (size_t i = 1; i < vecs.size(); i++) { @@ -5215,6 +5190,31 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { } } +Value *CodeGen_LLVM::optimization_fence(Value *v) { + llvm::Type *t = v->getType(); + if (t->isScalableTy()) { + // Convert to fixed, fence, convert back. 
+ Value *fixed = scalable_to_fixed_vector_type(v); + fixed = optimization_fence(fixed); + return fixed_to_scalable_vector_type(fixed); + } + const int bits = t->getPrimitiveSizeInBits(); + if (bits % 32) { + const int lanes = get_vector_num_elements(t); + const int element_bits = t->getScalarSizeInBits(); + const int lanes_per_32_bits = 32 / element_bits; + const int padded_lanes = align_up(lanes, lanes_per_32_bits); + v = slice_vector(v, 0, padded_lanes); + v = optimization_fence(v); + v = slice_vector(v, 0, lanes); + return v; + } + llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); + v = builder->CreateBitCast(v, float_type); + v = builder->CreateArithmeticFence(v, float_type); + return builder->CreateBitCast(v, t); +} + Value *CodeGen_LLVM::concat_vectors(const vector &v) { if (v.size() == 1) { return v[0]; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 415de2463b47..57d78172c4fa 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -465,10 +465,6 @@ class CodeGen_LLVM : public IRVisitor { /** The inverse of interleave_vectors. */ virtual std::vector deinterleave_vector(llvm::Value *vec, int num_vecs); - /** A fence to prevent fusion of ops by llvm. Designed for floats, but we - * abuse it to prevent shufflevector fusion too. */ - virtual llvm::Value *optimization_fence(llvm::Value *); - /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments * and return type must match exactly for an overload resolution to succeed. */ @@ -518,6 +514,11 @@ class CodeGen_LLVM : public IRVisitor { * if you ask for more lanes than the vector has. */ virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent); + /** Use an arithmetic fence to prevent LLVM from fusing operations + * across this barrier. Works by bitcasting to float, applying + * llvm.arithmetic.fence, and bitcasting back. 
*/ + virtual llvm::Value *optimization_fence(llvm::Value *); + /** Concatenate a bunch of llvm vectors. Must be of the same type. */ virtual llvm::Value *concat_vectors(const std::vector &);