diff --git a/.gitignore b/.gitignore index a08b8e8dd7f3..888235a389d8 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,9 @@ xcuserdata # NeoVim + clangd .cache +# CCLS +.ccls-cache + # Emacs tags TAGS diff --git a/Makefile b/Makefile index c668cf20fdcd..6c6439478acd 100644 --- a/Makefile +++ b/Makefile @@ -535,6 +535,7 @@ SOURCE_FILES = \ IRVisitor.cpp \ JITModule.cpp \ Lambda.cpp \ + LegalizeVectors.cpp \ Lerp.cpp \ LICM.cpp \ LLVM_Output.cpp \ @@ -737,6 +738,7 @@ HEADER_FILES = \ IRVisitor.h \ JITModule.h \ Lambda.h \ + LegalizeVectors.h \ Lerp.h \ LICM.h \ LLVM_Output.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 036b92651667..22dc6202b0f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,7 +37,8 @@ endif () set_target_properties(Halide PROPERTIES POSITION_INDEPENDENT_CODE ON) ## -# Lists of source files. Keep ALL lists sorted in alphabetical order. +# Lists of source files. Keep ALL lists sorted in case-insensitive alphabetical order. +# (neo)vim users can use ":sort i" in visual line mode. ## # The externally-visible header files that go into making Halide.h. @@ -145,6 +146,7 @@ target_sources( IRVisitor.h JITModule.h Lambda.h + LegalizeVectors.h Lerp.h LICM.h LLVM_Output.h @@ -323,6 +325,7 @@ target_sources( IRVisitor.cpp JITModule.cpp Lambda.cpp + LegalizeVectors.cpp Lerp.cpp LICM.cpp LLVM_Output.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..6051e5e9cf62 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -33,6 +33,11 @@ bool should_extract(const Expr &e, bool lift_all) { return false; } + if (const Call *c = e.as()) { + // Calls with side effects should not be moved. 
+ return c->is_pure() || c->call_type == Call::Halide; + } + if (lift_all) { return true; } diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7178e82965d8..592072a677c1 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1524,7 +1524,7 @@ void CodeGen_ARM::visit(const Store *op) { // Declare the function std::ostringstream instr; vector arg_types; - llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, false, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); + llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, true, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); if (target.bits == 32) { instr << "llvm.arm.neon.vst" << num_vecs diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 065dcebd1a64..9d33f7d4643b 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1157,7 +1157,7 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, internal_assert(result_elements > 0); llvm::Type *result_ty = get_vector_type(element_ty, result_elements); - // Try to rewrite shuffles that only access the elements of b. + // Find the range of non-dont-care indices. int min = INT_MAX; int max = -1; for (int idx : indices) { @@ -1169,6 +1169,8 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, if (min == INT_MAX) { return llvm::PoisonValue::get(result_ty); } + + // Try to rewrite shuffles that only access the elements of b. 
if (min >= a_elements) { vector shifted_indices(indices); for (int &i : shifted_indices) { @@ -1565,6 +1567,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector &indices) { Value *ret = nullptr; for (int i = 0; i < lut_elements; i += native_elements) { Value *lut_i = slice_vector(lut, i, native_elements); + internal_assert(get_vector_num_elements(lut_i->getType()) == native_elements); vector indices_i(native_elements); vector mask(native_elements); bool all_used = true; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 300dfa096a1e..bad10f263661 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5093,10 +5093,11 @@ Value *CodeGen_LLVM::shuffle_vectors(Value *a, Value *b, } // Check for type identity *after* normalizing to fixed vectors internal_assert(a->getType() == b->getType()); + int elements_a = get_vector_num_elements(a->getType()); vector llvm_indices(indices.size()); for (size_t i = 0; i < llvm_indices.size(); i++) { if (indices[i] >= 0) { - internal_assert(indices[i] < get_vector_num_elements(a->getType()) * 2); + internal_assert(indices[i] < elements_a * 2) << indices[i] << " " << elements_a * 2; llvm_indices[i] = ConstantInt::get(i32_t, indices[i]); } else { // Only let -1 be undef. 
diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 48f1468c1316..a1de1be84099 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2086,31 +2086,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { debug(3) << "\n"; if (arg_ids.size() == 1) { - // 1 argument, just do a simple assignment via a cast SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); builder.update_id(result_id); } else if (arg_ids.size() == 2) { - - // 2 arguments, use a composite insert to update even and odd indices - uint32_t even_idx = 0; - uint32_t odd_idx = 1; - SpvFactory::Indices even_indices; - SpvFactory::Indices odd_indices; - for (int i = 0; i < op_lanes; ++i) { - even_indices.push_back(even_idx); - odd_indices.push_back(odd_idx); - even_idx += 2; - odd_idx += 2; + // 2 arguments, use vector-shuffle with logical indices indexing into (vec1[0], vec1[1], ..., vec2[0], vec2[1], ...) + SpvFactory::Indices logical_indices; + for (int i = 0; i < arg_lanes; ++i) { + logical_indices.push_back(uint32_t(i)); + logical_indices.push_back(uint32_t(i + arg_lanes)); } SpvId type_id = builder.declare_type(op->type); - SpvId value_id = builder.declare_null_constant(op->type); - SpvId partial_id = builder.reserve_id(SpvResultId); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_insert(type_id, partial_id, arg_ids[0], value_id, even_indices)); - builder.append(SpvFactory::composite_insert(type_id, result_id, arg_ids[1], partial_id, odd_indices)); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, arg_ids[0], arg_ids[1], logical_indices)); builder.update_id(result_id); } else { @@ -2140,7 +2130,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else if (op->is_extract_element()) { int idx = op->indices[0]; internal_assert(idx >= 0); - internal_assert(idx <= op->vectors[0].type().lanes()); + internal_assert(idx < op->vectors[0].type().lanes()); 
if (op->vectors[0].type().is_vector()) { SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index f7a5b5f49aa8..d6d6463d614d 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -17,6 +17,33 @@ namespace Internal { using std::pair; +std::string variable_name_with_extracted_lanes( + const std::string &varname, int varlanes, + int starting_lane, int lane_stride, int new_lanes) { + + if (lane_stride * new_lanes == varlanes) { + if (starting_lane == 0 && lane_stride == 2) { + return varname + ".even_lanes"; + } else if (starting_lane == 1 && lane_stride == 2) { + return varname + ".odd_lanes"; + } + } + if (lane_stride == 1) { + return varname + ".lanes_" + std::to_string(starting_lane) + + "_to_" + std::to_string(starting_lane + new_lanes - 1); + } else { + // Just specify the slice + std::string name = varname; + name += ".slice_"; + name += std::to_string(starting_lane); + name += "_"; + name += std::to_string(lane_stride); + name += "_"; + name += std::to_string(new_lanes); + return name; + } +} + namespace { class StoreCollector : public IRMutator { @@ -176,13 +203,17 @@ Stmt collect_strided_stores(const Stmt &stmt, const std::string &name, int strid return collect.mutate(stmt); } -class Deinterleaver : public IRGraphMutator { +class ExtractLanes : public IRMutator { public: - Deinterleaver(int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) + ExtractLanes( + int starting_lane, int lane_stride, int new_lanes, + const Scope<> &sliceable_lets, + Scope> &requested_slices) : starting_lane(starting_lane), lane_stride(lane_stride), new_lanes(new_lanes), - external_lets(lets) { + requested_slices(requested_slices) { + this->sliceable_lets.set_containing_scope(&sliceable_lets); } private: @@ -190,35 +221,177 @@ class Deinterleaver : public IRGraphMutator { int lane_stride; int new_lanes; - // lets for which we have even and odd lane 
specializations - const Scope<> &external_lets; + // vector lets we're allowed to request slices of + Scope<> sliceable_lets; + + // We populate this with the slices we need from the external_lets. + Scope> &requested_slices; using IRMutator::visit; + inline bool needs_extracting(const Expr &op) { + if (op.type().is_scalar()) { + return false; + } + return !(starting_lane == 0 && lane_stride == 1 && new_lanes == op.type().lanes()); + } + + Expr extract_lanes_from_make_struct(const Call *op) { + internal_assert(op); + internal_assert(op->is_intrinsic(Call::make_struct)); + auto [args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return Call::make(op->type, Call::make_struct, args, Call::Intrinsic); + } + + Expr extract_lanes_trace(const Call *op) { + auto event = as_const_int(op->args[6]); + internal_assert(event); + if (*event == halide_trace_load || *event == halide_trace_store) { + debug(3) << "Extracting Trace Lanes: " << Expr(op) << "\n"; + const Expr &func = op->args[0]; + Expr values = extract_lanes_from_make_struct(op->args[1].as()); + Expr coords = extract_lanes_from_make_struct(op->args[2].as()); + const Expr &type_code = op->args[3]; + const Expr &type_bits = op->args[4]; + int type_lanes = *as_const_int(op->args[5]); + const Expr &event = op->args[6]; + const Expr &parent_id = op->args[7]; + const Expr &idx = op->args[8]; + int size = *as_const_int(op->args[9]); + const Expr &tag = op->args[10]; + + int num_vecs = op->args[2].as()->args.size(); + internal_assert(size == type_lanes * num_vecs) << Expr(op); + std::vector args = { + func, + values, coords, + type_code, type_bits, Expr(new_lanes), + event, parent_id, idx, Expr(new_lanes * num_vecs), + tag}; + Expr result = Call::make(Int(32), Call::trace, args, Call::Extern); + debug(4) << " => " << result << "\n"; + return result; + } + + internal_error << "Unhandled trace call in ExtractLanes: " << *event; + } + + Expr visit(const Let *op) override { + + // Visit an entire 
chain of lets in a single method to conserve stack space. + + // This logic is very similar to the same visit method in interleaver, but not + // the same. We don't mutate the let values by default, we just produce + // any requested slices of them. + + struct Frame { + const Let *op; + ScopedBinding<> binding; + Frame(const Let *op, Scope &scope) + : op(op), + binding(op->value.type().is_vector(), scope, op->name) { + } + }; + std::vector frames; + Expr result; + + do { + result = op->body; + frames.emplace_back(op, sliceable_lets); + } while ((op = result.template as())); + + result = mutate(result); + + std::set vars_used; + auto track_vars_used = [&](const Expr &e) { + return visit_with(e, + [&](auto *self, const Variable *var) { + vars_used.insert(var->name); + }); + }; + track_vars_used(result); + + for (const auto &frame : reverse_view(frames)) { + + // The original variable, if it's needed. + if (vars_used.count(frame.op->name)) { + result = Let::make(frame.op->name, frame.op->value, result); + track_vars_used(frame.op->value); + } + + // For vector lets, we may additionally need lets for the requested + // slices of this variable: + if (frame.op->value.type().is_vector()) { + if (std::vector *reqs = requested_slices.shallow_find(frame.op->name)) { + for (const VectorSlice &sl : *reqs) { + Expr slice; + { + ScopedValue old_start(starting_lane, sl.start); + ScopedValue old_stride(lane_stride, sl.stride); + ScopedValue old_count(new_lanes, sl.count); + slice = mutate(frame.op->value); + } + track_vars_used(slice); + result = Let::make(sl.variable_name, slice, result); + } + requested_slices.pop(frame.op->name); + } + } + } + + return result; + } + Expr visit(const VectorReduce *op) override { - std::vector input_lanes; + if (!needs_extracting(op)) { + return op; + } int factor = op->value.type().lanes() / op->type.lanes(); - for (int i = starting_lane; i < op->type.lanes(); i += lane_stride) { - for (int j = 0; j < factor; j++) { - input_lanes.push_back(i * factor + 
j); + if (lane_stride != 1) { + std::vector input_lanes; + for (int i = 0; i < new_lanes; ++i) { + int lane_start = (starting_lane + lane_stride * i) * factor; + for (int j = 0; j < factor; j++) { + input_lanes.push_back(lane_start + j); + } + } + Expr in = Shuffle::make({op->value}, input_lanes); + return VectorReduce::make(op->op, in, new_lanes); + } else { + Expr in; + { + ScopedValue old_starting_lane(starting_lane, starting_lane * factor); + ScopedValue old_new_lanes(new_lanes, new_lanes * factor); + in = mutate(op->value); } + if (new_lanes == op->type.lanes() && in.same_as(op->value)) { + return op; + } + return VectorReduce::make(op->op, in, new_lanes); } - Expr in = Shuffle::make({op->value}, input_lanes); - return VectorReduce::make(op->op, in, new_lanes); } Expr visit(const Broadcast *op) override { + if (const Call *call = op->value.as()) { + if (call->name == Call::trace) { + Expr value = extract_lanes_trace(call); + if (new_lanes == 1) { + return value; + } else { + return Broadcast::make(value, new_lanes); + } + } + } if (new_lanes == 1) { if (op->value.type().lanes() == 1) { return op->value; } else { - int old_starting_lane = starting_lane; - int old_lane_stride = lane_stride; - starting_lane = starting_lane % op->value.type().lanes(); - lane_stride = op->value.type().lanes(); + ScopedValue old_starting_lane(starting_lane, starting_lane % op->value.type().lanes()); + ScopedValue old_lane_stride(lane_stride, op->value.type().lanes()); Expr e = mutate(op->value); - starting_lane = old_starting_lane; - lane_stride = old_lane_stride; return e; } } @@ -227,57 +400,70 @@ class Deinterleaver : public IRGraphMutator { return mutate(flatten_nested_ramps(op)); } + if (new_lanes == op->type.lanes()) { + return op; + } return Broadcast::make(op->value, new_lanes); } Expr visit(const Load *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; - } else { - Type t = op->type.with_lanes(new_lanes); - ModulusRemainder align = 
op->alignment; - // The alignment of a Load refers to the alignment of the first - // lane, so we can preserve the existing alignment metadata if the - // deinterleave is asking for any subset of lanes that includes the - // first. Otherwise we just drop it. We could check if the index is - // a ramp with constant stride or some other special case, but if - // that's the case, the simplifier is very good at figuring out the - // alignment, and it has access to context (e.g. the alignment of - // enclosing lets) that we do not have here. - if (starting_lane != 0) { - align = ModulusRemainder(); - } - return Load::make(t, op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), align); } + Type t = op->type.with_lanes(new_lanes); + ModulusRemainder align = op->alignment; + // The alignment of a Load refers to the alignment of the first + // lane, so we can preserve the existing alignment metadata if the + // deinterleave is asking for any subset of lanes that includes the + // first. Otherwise we just drop it. We could check if the index is + // a ramp with constant stride or some other special case, but if + // that's the case, the simplifier is very good at figuring out the + // alignment, and it has access to context (e.g. the alignment of + // enclosing lets) that we do not have here. 
+ if (starting_lane != 0) { + align = ModulusRemainder(); + } + return Load::make(t, op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), align); } Expr visit(const Ramp *op) override { + if (!needs_extracting(op)) { + return op; + } int base_lanes = op->base.type().lanes(); if (base_lanes > 1) { if (new_lanes == 1) { int index = starting_lane / base_lanes; - Expr expr = op->base + cast(op->base.type(), index) * op->stride; + Expr expr = simplify(op->base + cast(op->base.type(), index) * op->stride); ScopedValue old_starting_lane(starting_lane, starting_lane % base_lanes); ScopedValue old_lane_stride(lane_stride, base_lanes); expr = mutate(expr); return expr; } else if (base_lanes == lane_stride && starting_lane < base_lanes) { - // Base class mutator actually works fine in this - // case, but we only want one lane from the base and - // one lane from the stride. - ScopedValue old_new_lanes(new_lanes, 1); - return IRMutator::visit(op); + // We want one lane from the base and one lane from + // the stride, then build a new ramp with the right + // number of steps. + int ramp_lanes = new_lanes; + { + ScopedValue old_new_lanes(new_lanes, 1); + Expr new_base = mutate(op->base); + Expr new_stride = mutate(op->stride); + if (ramp_lanes == 1) { + return new_base; + } + return Ramp::make(new_base, new_stride, ramp_lanes); + } } else { // There is probably a more efficient way to this by // generalizing the two cases above. 
return mutate(flatten_nested_ramps(op)); } } - Expr expr = op->base + cast(op->base.type(), starting_lane) * op->stride; + Expr expr = simplify(op->base + cast(op->base.type(), starting_lane) * op->stride); internal_assert(expr.type() == op->base.type()); if (new_lanes > 1) { - expr = Ramp::make(expr, op->stride * lane_stride, new_lanes); + expr = Ramp::make(expr, simplify(op->stride * cast(op->base.type(), lane_stride)), new_lanes); } return expr; } @@ -294,39 +480,49 @@ class Deinterleaver : public IRGraphMutator { } Expr visit(const Variable *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; - } else { + } - Type t = op->type.with_lanes(new_lanes); - if (external_lets.contains(op->name) && - starting_lane == 0 && - lane_stride == 2) { - return Variable::make(t, op->name + ".even_lanes", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 1 && - lane_stride == 2) { - return Variable::make(t, op->name + ".odd_lanes", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 0 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_0_of_3", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 1 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_1_of_3", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 2 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_2_of_3", op->image, op->param, op->reduction_domain); + Type t = op->type.with_lanes(new_lanes); + + if (sliceable_lets.contains(op->name)) { + // The variable accessed is marked as sliceable by the caller. + // Let's request a slice and pretend it exists. 
+ std::string sliced_var_name = variable_name_with_extracted_lanes( + op->name, op->type.lanes(), + starting_lane, lane_stride, new_lanes); + + VectorSlice new_sl; // When C++20 lands: Designated initializer + new_sl.start = starting_lane; + new_sl.stride = lane_stride; + new_sl.count = new_lanes; + new_sl.variable_name = sliced_var_name; + + if (auto *vec = requested_slices.shallow_find(op->name)) { + bool found = false; + for (const VectorSlice &existing_sl : *vec) { + if (existing_sl.start == starting_lane && + existing_sl.stride == lane_stride && + existing_sl.count == new_lanes) { + found = true; + break; + } + } + if (!found) { + vec->push_back(std::move(new_sl)); + } } else { - return give_up_and_shuffle(op); + requested_slices.push(op->name, {std::move(new_sl)}); } + return Variable::make(t, sliced_var_name, op->image, op->param, op->reduction_domain); + } else { + return give_up_and_shuffle(op); } } Expr visit(const Cast *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; } else { Type t = op->type.with_lanes(new_lanes); @@ -335,122 +531,315 @@ class Deinterleaver : public IRGraphMutator { } Expr visit(const Reinterpret *op) override { - if (op->type.is_scalar()) { + // Written with assistance from Gemini 3 Pro, which required a lot of baby-sitting. + + // Simple case of a scalar reinterpret: always one lane: + if (!needs_extracting(op)) { return op; - } else if (op->type.bits() != op->value.type().bits()) { - return give_up_and_shuffle(op); - } else { - Type t = op->type.with_lanes(new_lanes); - return Reinterpret::make(t, mutate(op->value)); } + + int out_bits = op->type.bits(); + int in_bits = op->value.type().bits(); + + internal_assert(out_bits % in_bits == 0 || in_bits % out_bits == 0); + + // Case A: Stride 1. Calculate everything with bit-offsets + if (lane_stride == 1) { + + // Compute range of bits required from the input. 
+ int start_bit = starting_lane * out_bits; + int total_bits = new_lanes * out_bits; + int end_bit = start_bit + total_bits; + + // Convert this to a range of lane indices + int start_input_lane = start_bit / in_bits; + int end_input_lane = (end_bit + in_bits - 1) / in_bits; + int num_input_lanes = end_input_lane - start_input_lane; + + // Actually now get those lanes from the input. + Expr extracted_input_lanes; + { + ScopedValue old_sl(starting_lane, start_input_lane); + ScopedValue old_nl(new_lanes, num_input_lanes); + extracted_input_lanes = mutate(op->value); + } + + // The range of lanes we extracted from the input still might be too big, because + // we had to grab whole elements from the input, which can be coarser if out_bits > in_bits. + // So calculate how many lanes we extracted, when measured in the reinterpreted output type. + int intm_lanes = (num_input_lanes * in_bits) / out_bits; + Expr reinterpreted = Reinterpret::make(op->type.with_lanes(intm_lanes), extracted_input_lanes); + + // Now calculate how many output Type lanes we need to trim away. + int bits_to_strip_front = start_bit - (start_input_lane * in_bits); + int lanes_to_strip_front = bits_to_strip_front / out_bits; + + if (lanes_to_strip_front == 0 && intm_lanes == new_lanes) { + return reinterpreted; + } else { + return Shuffle::make_slice(reinterpreted, lanes_to_strip_front, 1, new_lanes); + } + } + + // Case B: Stride != 1. We are effectively gathering. + // We will rewrite those Reinterprets as a Concat of Reinterprets of extracted lanes. 
+ std::vector chunks(new_lanes); + for (int i = 0; i < new_lanes; ++i) { + // Find the bit range of this element in the output + int start_bit = (starting_lane + lane_stride * i) * out_bits; + int end_bit = start_bit + out_bits; + + // Map it to input lanes + int start_input_lane = start_bit / in_bits; + int end_input_lane = (end_bit + in_bits - 1) / in_bits; + int num_input_lanes = end_input_lane - start_input_lane; + + // Grab this range of lanes from the input. + Expr input_chunk; + { + ScopedValue old_start(starting_lane, start_input_lane); + ScopedValue old_stride(lane_stride, 1); + ScopedValue old_count(new_lanes, num_input_lanes); + input_chunk = mutate(op->value); + } + + // Reinterpret the chunk. + int extracted_bits = num_input_lanes * in_bits; + int reinterpreted_lanes = extracted_bits / out_bits; + internal_assert(reinterpreted_lanes != 0); + + Expr reinterpreted = Reinterpret::make(op->type.with_lanes(reinterpreted_lanes), input_chunk); + + // Now, in case of demotion: + // Example: + // R = ExtractLanes(Reinterpret([u32, u32, u32, u32], u8), 0, 2, 4) + // = ExtractLanes([u8_0, u8_1, u8_2, u8_3, ...], 0, 2, 4) + // = [u8_0, u8_2, u8_4, u8_6] + // A single extracted u32 element is too large, even after reinterpreting. + // So we need to slice the reinterpreted result. + int bit_offset = start_bit - (start_input_lane * in_bits); + int lane_offset = bit_offset / out_bits; + + if (lane_offset == 0 && reinterpreted_lanes == 1) { + chunks[i] = std::move(reinterpreted); + } else { + chunks[i] = Shuffle::make_extract_element(reinterpreted, lane_offset); + } + } + + // In case of demotion, we will potentially extract and reinterpret the same input lane several times. + // Simplification afterwards will turn them into Lets. 
+ + return Shuffle::make_concat(chunks); } Expr visit(const Call *op) override { - Type t = op->type.with_lanes(new_lanes); + internal_assert(op->type.lanes() >= starting_lane + lane_stride * (new_lanes - 1)) << Expr(op) << starting_lane << " " << lane_stride << " " << new_lanes; // Don't mutate scalars - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; } else { - // Vector calls are always parallel across the lanes, so we // can just deinterleave the args. + Type t = op->type.with_lanes(new_lanes); - // Beware of intrinsics for which this is not true! - auto args = mutate(op->args); - return Call::make(t, op->name, args, op->call_type, - op->func, op->value_index, op->image, op->param); + auto [args, changed] = mutate_with_changes(op->args); + if (!changed) { + // It's possible that this is a slice where output lanes = input + // lanes (e.g. reversing a vector) and the args are invariant + // under that slice (e.g. they are broadcasts). + internal_assert(t == op->type); + return op; + } else { + return Call::make(t, op->name, args, op->call_type, + op->func, op->value_index, op->image, op->param); + } } } Expr visit(const Shuffle *op) override { + if (!needs_extracting(op)) { + return op; + } + + // Special case 1: Scalar extraction + if (new_lanes == 1) { + // Find in which vector it sits. + int index = op->indices[starting_lane]; + for (const auto &vec : op->vectors) { + if (index < vec.type().lanes()) { + // We found the source vector. Extract the scalar from it. + ScopedValue old_start(starting_lane, index); + ScopedValue old_stride(lane_stride, 1); // Stride doesn't matter for scalar + ScopedValue old_count(new_lanes, 1); + return mutate(vec); + } + index -= vec.type().lanes(); + } + internal_error << "extract_lane index out of bounds: " << Expr(op) << " " << index << "\n"; + } + if (op->is_interleave()) { // Special case where we can discard some of the vector arguments entirely. 
- internal_assert(starting_lane >= 0 && starting_lane < lane_stride); - if ((int)op->vectors.size() == lane_stride) { - return op->vectors[starting_lane]; - } else if ((int)op->vectors.size() % lane_stride == 0) { - // Pick up every lane-stride vector. - std::vector new_vectors(op->vectors.size() / lane_stride); - for (size_t i = 0; i < new_vectors.size(); i++) { - new_vectors[i] = op->vectors[i * lane_stride + starting_lane]; + internal_assert(starting_lane >= 0); + int n_vectors = (int)op->vectors.size(); + + // Case A: Stride is a multiple of the number of input vectors. + // Example: extract_lanes(interleave(A, B), stride=4) + // result comes from either A or B, depending on starting lane modulo number of vectors, + // required stride of said vector is lane_stride / num_vectors + if (lane_stride > 0 && lane_stride % n_vectors == 0) { + const Expr &vec = op->vectors[starting_lane % n_vectors]; + if (vec.type().lanes() == new_lanes) { + // We need all lanes of this vector, just return it. + return vec; + } else { + // We don't need all lanes, unfortunately. Let's extract the part we need. + ScopedValue old_starting_lane(starting_lane, starting_lane / n_vectors); + ScopedValue old_lane_stride(lane_stride, lane_stride / n_vectors); + return mutate(vec); + } + } + + // Case B: Number of vectors is a multiple of the stride. + // Eg: extract_lanes(interleave(a, b, c, d, e, f), start=8, stride=3) + // = extract_lanes(a0, b0, c0, d0, e0, f0, a1, b1, c1, d1, e1, f1, ...) + // = (a2, c2, e2, c1, ...) + // = interleave(a, c) + if (lane_stride > 0 && n_vectors % lane_stride == 0) { + int num_required_vectors = n_vectors / lane_stride; + + // The result is only an interleave if the number of constituent + // vectors divides the number of total required lanes. + if (new_lanes % num_required_vectors == 0) { + int lanes_per_vec = new_lanes / num_required_vectors; + + // Pick up every lane-stride vector. 
+ std::vector new_vectors(num_required_vectors); + for (size_t i = 0; i < new_vectors.size(); i++) { + int absolute_lane_index = starting_lane + i * lane_stride; + int src_vec_idx = absolute_lane_index % n_vectors; + int vec_lane_start = absolute_lane_index / n_vectors; + const Expr &vec = op->vectors[src_vec_idx]; + + ScopedValue old_starting_lane(starting_lane, vec_lane_start); + ScopedValue old_lane_stride(lane_stride, 1); + ScopedValue old_new_lanes(new_lanes, lanes_per_vec); + new_vectors[i] = mutate(vec); + } + return Shuffle::make_interleave(new_vectors); } - return Shuffle::make_interleave(new_vectors); } } - // Keep the same set of vectors and extract every nth numeric - // arg to the shuffle. - std::vector indices; + // General case fallback + std::vector indices(new_lanes); + bool constant_stride = true; for (int i = 0; i < new_lanes; i++) { - int idx = i * lane_stride + starting_lane; - indices.push_back(op->indices[idx]); - } - - // If this is extracting a single lane, try to recursively deinterleave rather - // than leaving behind a shuffle. - if (indices.size() == 1) { - int index = indices.front(); - for (const auto &i : op->vectors) { - if (index < i.type().lanes()) { - ScopedValue lane(starting_lane, index); - return mutate(i); + int idx = op->indices[i * lane_stride + starting_lane]; + indices[i] = idx; + if (i > 1 && constant_stride) { + int stride = indices[1] - indices[0]; + if (indices[i] != indices[i - 1] + stride) { + constant_stride = false; } - index -= i.type().lanes(); } - internal_error << "extract_lane index out of bounds: " << Expr(op) << " " << index << "\n"; + } + + // One optimization if we take a slice of a single vector. 
+ if (constant_stride) { + int stride = indices[1] - indices[0]; + int first_idx = indices.front(); + int last_idx = indices.back(); + + // Find which vector contains this range + int current_bound = 0; + for (const auto &vec : op->vectors) { + int vec_lanes = vec.type().lanes(); + + // Check if the START of the ramp is in this vector + if (first_idx >= current_bound && first_idx < current_bound + vec_lanes) { + + // We found the vector containing the start. + // Now, because it is a linear ramp, we only need to check if the + // END of the ramp is also within this same vector. + // (This handles negative strides, forward strides, and broadcasts correctly). + if (last_idx >= current_bound && last_idx < current_bound + vec_lanes) { + + // Calculate the start index relative to this specific vector + int local_start = first_idx - current_bound; + + ScopedValue s_start(starting_lane, local_start); + ScopedValue s_stride(lane_stride, stride); + // new_lanes is already correct + return mutate(vec); + } + + // If the start is here but the end is elsewhere, the ramp crosses + // vector boundaries. We cannot optimize this as a single vector extraction. 
+ break; + } + current_bound += vec_lanes; + } } return Shuffle::make(op->vectors, indices); } }; -Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) { - e = substitute_in_all_lets(e); - Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); - e = d.mutate(e); - e = common_subexpression_elimination(e); - return simplify(e); -} - -Expr extract_odd_lanes(const Expr &e, const Scope<> &lets) { - internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 1, 2, e.type().lanes() / 2, lets); -} +} // namespace -Expr extract_even_lanes(const Expr &e, const Scope<> &lets) { - internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 0, 2, (e.type().lanes() + 1) / 2, lets); +Expr extract_lanes(const Expr &original_expr, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets, Scope> &requested_sliced_lets) { + internal_assert(starting_lane + (new_lanes - 1) * lane_stride <= original_expr.type().lanes()) + << "Extract lanes with start:" << starting_lane << ", stride:" << lane_stride << ", new_lanes:" << new_lanes << " " + << "out of " << original_expr.type() << " which goes out of bounds."; + + debug(3) << "ExtractLanes " + << "(start:" << starting_lane << ", stride:" << lane_stride << ", new_lanes:" << new_lanes << "): " + << original_expr << " of Type: " << original_expr.type() << "\n"; + Type original_type = original_expr.type(); + ExtractLanes d(starting_lane, lane_stride, new_lanes, lets, requested_sliced_lets); + Expr e = d.mutate(original_expr); + e = common_subexpression_elimination(e); + debug(3) << " => " << e << "\n"; + Type final_type = e.type(); + internal_assert(original_type.code() == final_type.code()) + << "Underlying types not identical after extract_lanes:\n" + << "Before: " << original_expr << "\n" + << "After: " << e << "\n"; + internal_assert(new_lanes == final_type.lanes()) + << "Number of lanes incorrect after extract_lanes: " << final_type.lanes() << " while 
expected was " << new_lanes << ": extract_lanes(" << starting_lane << ", " << lane_stride << ", " << new_lanes << "):\n" + << "Input: " << original_expr << "\nResult: " << e; + return e; } -Expr extract_mod3_lanes(const Expr &e, int lane, const Scope<> &lets) { - internal_assert(e.type().lanes() % 3 == 0); - return deinterleave(e, lane, 3, (e.type().lanes() + 2) / 3, lets); +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes) { + Scope<> lets; + Scope> req; + return extract_lanes(e, starting_lane, lane_stride, new_lanes, lets, req); } -} // namespace - Expr extract_even_lanes(const Expr &e) { internal_assert(e.type().lanes() % 2 == 0); - Scope<> lets; - return extract_even_lanes(e, lets); + return extract_lanes(e, 0, 2, e.type().lanes() / 2); } Expr extract_odd_lanes(const Expr &e) { internal_assert(e.type().lanes() % 2 == 0); - Scope<> lets; - return extract_odd_lanes(e, lets); + return extract_lanes(e, 1, 2, e.type().lanes() / 2); } Expr extract_lane(const Expr &e, int lane) { - Scope<> lets; - return deinterleave(e, lane, e.type().lanes(), 1, lets); + return extract_lanes(e, lane, e.type().lanes(), 1); } namespace { +// Change name to DensifyStridedLoadsAndStores? 
class Interleaver : public IRMutator { Scope<> vector_lets; + Scope> requested_sliced_lets; using IRMutator::visit; @@ -459,9 +848,9 @@ class Interleaver : public IRMutator { Expr deinterleave_expr(const Expr &e) { std::vector exprs; + exprs.reserve(num_lanes); for (int i = 0; i < num_lanes; i++) { - Scope<> lets; - exprs.emplace_back(deinterleave(e, i, num_lanes, e.type().lanes() / num_lanes, lets)); + exprs.emplace_back(extract_lanes(e, i, num_lanes, e.type().lanes() / num_lanes, vector_lets, requested_sliced_lets)); } return Shuffle::make_interleave(exprs); } @@ -492,18 +881,21 @@ class Interleaver : public IRMutator { for (const auto &frame : reverse_view(frames)) { Expr value = std::move(frame.new_value); + // The original variable: result = LetOrLetStmt::make(frame.op->name, value, result); - // For vector lets, we may additionally need a let defining the even and odd lanes only + // For vector lets, we may additionally need a lets for the requested slices of this variable: if (value.type().is_vector()) { - if (value.type().lanes() % 2 == 0) { - result = LetOrLetStmt::make(frame.op->name + ".even_lanes", extract_even_lanes(value, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".odd_lanes", extract_odd_lanes(value, vector_lets), result); - } - if (value.type().lanes() % 3 == 0) { - result = LetOrLetStmt::make(frame.op->name + ".lanes_0_of_3", extract_mod3_lanes(value, 0, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".lanes_1_of_3", extract_mod3_lanes(value, 1, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".lanes_2_of_3", extract_mod3_lanes(value, 2, vector_lets), result); + if (std::vector *reqs = + requested_sliced_lets.shallow_find(frame.op->name)) { + for (const VectorSlice &sl : *reqs) { + result = LetOrLetStmt::make( + sl.variable_name, + extract_lanes(value, sl.start, sl.stride, sl.count, + vector_lets, requested_sliced_lets), + result); + } + 
requested_sliced_lets.pop(frame.op->name); } } } @@ -718,7 +1110,7 @@ class Interleaver : public IRMutator { const Ramp *ri = stores[i].as<Store>()->index.as<Ramp>(); internal_assert(ri); - // Mismatched store vector laness. + // Mismatched store vector lanes. if (ri->lanes != lanes) { return Stmt(); } diff --git a/src/Deinterleave.h b/src/Deinterleave.h index 485641f71a5f..630fa8e7ecc1 100644 --- a/src/Deinterleave.h +++ b/src/Deinterleave.h @@ -9,15 +9,21 @@ */ #include "Expr.h" +#include "Scope.h" namespace Halide { namespace Internal { -/** Extract the odd-numbered lanes in a vector */ -Expr extract_odd_lanes(const Expr &a); +struct VectorSlice { + int start, stride, count; + std::string variable_name; +}; -/** Extract the even-numbered lanes in a vector */ -Expr extract_even_lanes(const Expr &a); +/** Extract lanes, relying on the fact that the caller will provide new variables in Lets or LetStmts which correspond to slices of the original variable. */ +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &sliceable_lets, Scope<std::vector<VectorSlice>> &requested_sliced_lets); + +/** Extract lanes without requesting any extra slices from variables. 
*/ +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes); /** Extract the nth lane of a vector */ Expr extract_lane(const Expr &vec, int lane); diff --git a/src/IR.cpp b/src/IR.cpp index 2c91d16b50e1..0cc6cb828500 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -12,7 +12,7 @@ namespace Internal { Expr Cast::make(Type t, Expr v) { internal_assert(v.defined()) << "Cast of undefined\n"; - internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths\n"; + internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths: " << v << " of type " << v.type() << " cannot be cast to " << t << "\n"; Cast *node = new Cast; node->type = t; @@ -281,7 +281,7 @@ Expr Ramp::make(Expr base, Expr stride, int lanes) { Expr Broadcast::make(Expr value, int lanes) { internal_assert(value.defined()) << "Broadcast of undefined\n"; - internal_assert(lanes != 1) << "Broadcast of lanes 1\n"; + internal_assert(lanes != 1) << "Broadcast over 1 lane is not a broadcast\n"; Broadcast *node = new Broadcast; node->type = value.type().with_lanes(lanes * value.type().lanes()); diff --git a/src/IROperator.h b/src/IROperator.h index 797f12870f5d..015b767bd330 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -1285,7 +1285,8 @@ Expr random_int(Expr seed = Expr()); /** Create an Expr that prints out its value whenever it is * evaluated. It also prints out everything else in the arguments - * list, separated by spaces. This can include string literals. */ + * list, separated by spaces. This can include string literals. + * Evaluates to the first argument passed. 
*/ //@{ Expr print(const std::vector &values); diff --git a/src/LegalizeVectors.cpp b/src/LegalizeVectors.cpp new file mode 100644 index 000000000000..85fba6c3eca1 --- /dev/null +++ b/src/LegalizeVectors.cpp @@ -0,0 +1,541 @@ +#include "LegalizeVectors.h" +#include "CSE.h" +#include "Deinterleave.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Util.h" + +#include +#include + +namespace Halide { +namespace Internal { + +namespace { + +using namespace std; + +int max_lanes_for_device(DeviceAPI api, int parent_max_lanes) { + // The environment variable below (HL_FORCE_VECTOR_LEGALIZATION) is here solely for testing purposes. + // It is useful to "stress-test" this lowering pass by forcing a shorter maximal vector size across + // all codegen across the entire test suite. This should not be used in real uses of Halide. + std::string envvar = Halide::Internal::get_env_variable("HL_FORCE_VECTOR_LEGALIZATION"); + if (!envvar.empty()) { + return std::atoi(envvar.c_str()); + } + // The remainder of this function correctly determines the number of lanes the device API supports. + switch (api) { + case DeviceAPI::Metal: + case DeviceAPI::WebGPU: + case DeviceAPI::Vulkan: + case DeviceAPI::D3D12Compute: + return 4; + case DeviceAPI::OpenCL: + return 16; + case DeviceAPI::CUDA: + case DeviceAPI::Hexagon: + case DeviceAPI::HexagonDma: + case DeviceAPI::Host: + return 0; // No max: LLVM based legalization + case DeviceAPI::None: + return parent_max_lanes; + case DeviceAPI::Default_GPU: + internal_error << "No GPU API was selected."; + return 0; + } + internal_error << "Unknown Device API"; + return 0; +} + +class LiftLetToLetStmt : public IRMutator { + using IRMutator::visit; + + unordered_set lifted_let_names; + vector lets; + Expr visit(const Let *op) override { + internal_assert(lifted_let_names.count(op->name) == 0) + << "Let " << op->name << " = ... 
cannot be lifted to LetStmt because the name is not unique."; + lets.push_back(op); + lifted_let_names.insert(op->name); + return mutate(op->body); + } + +public: + Stmt mutate(const Stmt &s) override { + ScopedValue scoped_lets(lets, {}); + Stmt mutated = IRMutator::mutate(s); + for (const Let *let : reverse_view(lets)) { + mutated = LetStmt::make(let->name, let->value, mutated); + } + return mutated; + } + + Expr mutate(const Expr &e) override { + return IRMutator::mutate(e); + } +}; + +class LiftExceedingVectors : public IRMutator { + using IRMutator::visit; + + int max_lanes; + + vector> lets; + bool just_in_let_definition{false}; + + template + auto visit_let_or_letstmt(const LetOrLetStmt *op) -> decltype(op->body) { + just_in_let_definition = true; + Expr def = mutate(op->value); + just_in_let_definition = false; + + decltype(op->body) body = mutate(op->body); + if (def.same_as(op->value) && body.same_as(op->body)) { + return op; + } + return LetOrLetStmt::make(op->name, std::move(def), std::move(body)); + } + + Expr visit(const Let *op) override { + return visit_let_or_letstmt(op); + } + + Stmt visit(const LetStmt *op) override { + return visit_let_or_letstmt(op); + } + + Expr visit(const Call *op) override { + // Custom handling of Call, to prevent certain things from being extracted out + // of the call arguments, as that's not always allowed. + bool exceeds_lanecount = op->type.lanes() > max_lanes; + Expr mutated = op; + if (exceeds_lanecount) { + std::vector args; + args.reserve(op->args.size()); + bool changed = false; + for (int i = 0; i < int(op->args.size()); ++i) { + bool may_extract = true; + if (op->is_intrinsic(Call::require)) { + // Call::require is special: it behaves a little like if-then-else: + // it runs the 3rd argument (the error handling part) only when there + // is an error. Extracting that would unconditionally print the error. 
+ may_extract &= i < 2; + } + if (op->is_intrinsic(Call::if_then_else)) { + // Only allow the condition to be extracted. + may_extract &= i == 0; + } + const Expr &arg = op->args[i]; + if (may_extract) { + internal_assert(arg.type().lanes() == op->type.lanes()); + Expr mutated = mutate(arg); + if (!mutated.same_as(arg)) { + changed = true; + } + args.push_back(mutated); + } else { + args.push_back(arg); + } + } + if (!changed) { + return op; + } + mutated = Call::make(op->type, op->name, args, op->call_type); + } else { + mutated = IRMutator::visit(op); + } + return mutated; + } + +public: + Stmt mutate(const Stmt &s) override { + ScopedValue scoped_lets(lets, {}); + just_in_let_definition = false; + Stmt mutated = IRMutator::mutate(s); + for (auto &let : reverse_view(lets)) { + // There is no recurse into let.second. This is handled by repeatedly calling this transform. + mutated = LetStmt::make(let.first, let.second, mutated); + } + return mutated; + } + + Expr mutate(const Expr &e) override { + bool exceeds_lanecount = e.type().lanes() > max_lanes; + + if (exceeds_lanecount) { + bool should_extract = false; + should_extract |= e.node_type() == IRNodeType::Shuffle; + should_extract |= e.node_type() == IRNodeType::VectorReduce; + + should_extract &= !just_in_let_definition; + + debug((should_extract ? 
3 : 4)) << "Max lanes (" << max_lanes << ") exceeded (" << e.type().lanes() << ") by: " << e << "\n"; + if (should_extract) { + std::string name = unique_name('t'); + Expr var = Variable::make(e.type(), name); + lets.emplace_back(name, e); + debug(3) << " => Lifted out into " << name << "\n"; + return var; + } + } + + just_in_let_definition = false; + return IRMutator::mutate(e); + } + + LiftExceedingVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LiftExceedingVectors should not be called when there is no lane limit."; + } +}; + +class LegalizeVectors : public IRMutator { + using IRMutator::visit; + + int max_lanes; + + Scope<> sliceable_vectors; + Scope> requested_slices; + + template + auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { + bool exceeds_lanecount = op->value.type().lanes() > max_lanes; + + if (exceeds_lanecount) { + int num_vecs = (op->value.type().lanes() + max_lanes - 1) / max_lanes; + debug(3) << "Legalize let " << op->value.type() << ": " << op->name + << " = " << op->value << " into " << num_vecs << " vecs\n"; + + // First mark this Let as sliceable before mutating the body: + ScopedBinding<> vector_is_slicable(sliceable_vectors, op->name); + + auto body = mutate(op->body); + // Here we know which requested vector variable slices should be created for the body of the Let/LetStmt to work. 
+ + if (const std::vector *reqs = requested_slices.find(op->name)) { + for (const VectorSlice &sl : *reqs) { + Expr value = extract_lanes(op->value, sl.start, sl.stride, sl.count, sliceable_vectors, requested_slices); + value = mutate(value); + body = LetOrLetStmt::make(sl.variable_name, value, body); + debug(3) << " Add: let " << sl.variable_name << " = " << value << "\n"; + } + requested_slices.pop(op->name); + } + return body; + } else { + return IRMutator::visit(op); + } + } + + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } + + Expr visit(const Let *op) override { + bool exceeds_lanecount = op->value.type().lanes() > max_lanes; + internal_assert(!exceeds_lanecount) << "All illegal Let's should have been converted to LetStmts"; + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + bool exceeds_lanecount = op->index.type().lanes() > max_lanes; + if (exceeds_lanecount) { + // Split up in multiple stores + int num_vecs = (op->index.type().lanes() + max_lanes - 1) / max_lanes; + + std::vector bundle_args; + bundle_args.reserve(num_vecs * 3); + + // Break up the index, predicate, and value of the Store into legal chunks. 
+ for (int i = 0; i < num_vecs; ++i) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); + + // Pack them in a known order: rhs, index, predicate + bundle_args.push_back(extract_lanes(op->value, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + bundle_args.push_back(extract_lanes(op->index, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + bundle_args.push_back(extract_lanes(op->predicate, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + } + + // Run CSE on the joint bundle + Expr joint_bundle = Call::make(Int(32), Call::bundle, bundle_args, Call::PureIntrinsic); + joint_bundle = common_subexpression_elimination(joint_bundle); + + // Peel off the `Let` expressions introduced by the CSE pass + std::vector> let_bindings; + while (const Let *let = joint_bundle.as()) { + let_bindings.emplace_back(let->name, let->value); + joint_bundle = let->body; + } + + // Destructure the bundle to get our optimized expressions + const Call *struct_call = joint_bundle.as(); + internal_assert(struct_call && struct_call->is_intrinsic(Call::bundle)) + << "Expected the CSE bundle to remain a bundle Call."; + + // Construct the legal stores with the CSE'd expressions + std::vector assignments; + assignments.reserve(num_vecs); + for (int i = 0; i < num_vecs; ++i) { + + // Unpack in the same order we packed them + Expr rhs = struct_call->args[i * 3 + 0]; + Expr index = struct_call->args[i * 3 + 1]; + Expr predicate = struct_call->args[i * 3 + 2]; + + ModulusRemainder alignment = op->alignment; + if (i != 0) { + // In case i == 0, we are taking the first lane, and the alignment is still valid. 
+ alignment = ModulusRemainder(); + } + + assignments.push_back(Store::make( + op->name, std::move(rhs), std::move(index), + op->param, std::move(predicate), alignment)); + } + + Stmt result = Block::make(assignments); + + // Wrap the block in LetStmts to properly scope all shared expressions + // Iterate backwards to build the LetStmt tree from the inside out. + for (auto &let : reverse_view(let_bindings)) { + result = LetStmt::make(let.first, let.second, result); + } + + debug(3) << "Legalized store " << Stmt(op) << " => " << result << "\n"; + return result; + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + // Primary violation: there are too many output lanes. + if (op->type.lanes() > max_lanes) { + // Break it down in multiple legal-output-length shuffles, and concatenate them back together. + int total_lanes = op->type.lanes(); + + std::vector output_chunks; + output_chunks.reserve((total_lanes + max_lanes - 1) / max_lanes); + for (int i = 0; i < total_lanes; i += max_lanes) { + int slice_len = std::min(max_lanes, total_lanes - i); + + std::vector slice_indices(slice_len); + for (int k = 0; k < slice_len; ++k) { + slice_indices[k] = op->indices[i + k]; + } + + Expr sub_shuffle = Shuffle::make(op->vectors, slice_indices); + + output_chunks.push_back(mutate(sub_shuffle)); + } + return Shuffle::make_concat(output_chunks); + } + + // Secondary violation: input vectors have too many lanes. + bool requires_mutation = false; + for (const auto &vec : op->vectors) { + if (vec.type().lanes() > max_lanes) { + requires_mutation = true; + break; + } + } + + if (requires_mutation) { + debug(4) << "Legalizing Shuffle " << Expr(op) << "\n"; + // We are dealing with a shuffle of an exceeding-lane-count vector argument. + // We can assume the variable here has extracted lane variables in surrounding Lets. + // So let's hope it's a simple case, and we can legalize. 
+ + vector new_vectors; + vector> vector_and_lane_indices = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->vectors.size()); ++i) { + const Expr &vec = op->vectors[i]; + if (vec.type().lanes() > max_lanes) { + debug(4) << " Arg " << i << ": " << vec << "\n"; + int num_vecs = (vec.type().lanes() + max_lanes - 1) / max_lanes; + for (int i = 0; i < num_vecs; i++) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(vec.type().lanes() - lane_start, max_lanes); + new_vectors.push_back(extract_lanes(vec, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + } + } else { + new_vectors.push_back(IRMutator::mutate(vec)); + } + } + Expr result = simplify(Shuffle::make(new_vectors, op->indices)); + debug(3) << "Legalized " << Expr(op) << " => " << result << "\n"; + return result; + } + + // Base case: everything legal in this Shuffle + return IRMutator::visit(op); + } + + Expr visit(const VectorReduce *op) override { + // Written with the help of Gemini 3 Pro. + Expr value = mutate(op->value); + + int input_lanes = value.type().lanes(); + int output_lanes = op->type.lanes(); + + // Base case: we don't need legalization. + if (input_lanes <= max_lanes && output_lanes <= max_lanes) { + if (value.same_as(op->value)) { + return op; + } else { + return VectorReduce::make(op->op, value, output_lanes); + } + } + + // Recursive splitting strategy. + // Case A: Segmented Reduction (Multiple Output Lanes) + // Example: VectorReduce( <16 lanes>, output_lanes=2 ) with max_lanes=4. + // Input is too big. We split the OUTPUT domain. + // We calculate which chunk of the input corresponds to the first half of the output. + if (output_lanes > 1) { + // 1. Calculate good splitting point + int out_split = output_lanes / 2; + + // 2. 
However, do align to max_lanes to keep chunks native-sized if possible + if (out_split > max_lanes) { + out_split = (out_split / max_lanes) * max_lanes; + } else if (output_lanes > max_lanes) { + // If the total is > max, but half is < max (e.g. 6), + // we want to peel 'max' (4) rather than split (3). + out_split = max_lanes; + } + + // Take remainder beyond the split point + int out_remaining = output_lanes - out_split; + internal_assert(out_remaining >= 1); + + // Calculate the reduction factor to find where to split the input + // e.g., 16 input -> 2 output means factor is 8. + // If we want the first 1 output lane, we need the first 8 input lanes. + int reduction_factor = input_lanes / output_lanes; + int in_split = out_split * reduction_factor; + int in_remaining = input_lanes - in_split; + + Expr arg_lo = extract_lanes(value, 0, 1, in_split, sliceable_vectors, requested_slices); + Expr arg_hi = extract_lanes(value, in_split, 1, in_remaining, sliceable_vectors, requested_slices); + + // Recursively mutate the smaller reductions + Expr res_lo = mutate(VectorReduce::make(op->op, arg_lo, out_split)); + Expr res_hi = mutate(VectorReduce::make(op->op, arg_hi, out_remaining)); + + // Concatenate the results to form the new vector + return Shuffle::make_concat({res_lo, res_hi}); + } + + // Case B: Horizontal Reduction (Single Output Lane) + // Example: VectorReduce( <16 lanes>, output_lanes=1 ) with max_lanes=4. + // We cannot split the output. We must split the INPUT, reduce both halves + // to scalars, and then combine them. 
+ if (output_lanes == 1) { + int in_split = input_lanes / 2; + int in_remaining = input_lanes - in_split; + + // Extract input halves + Expr arg_lo = extract_lanes(value, 0, 1, in_split, sliceable_vectors, requested_slices); + Expr arg_hi = extract_lanes(value, in_split, 1, in_remaining, sliceable_vectors, requested_slices); + + // Recursively reduce both halves to scalars + Expr res_lo = mutate(VectorReduce::make(op->op, arg_lo, 1)); + Expr res_hi = mutate(VectorReduce::make(op->op, arg_hi, 1)); + + // Combine using the standard binary operator for this reduction type + switch (op->op) { + case VectorReduce::Add: + return res_lo + res_hi; + case VectorReduce::SaturatingAdd: + return saturating_add(res_lo, res_hi); + case VectorReduce::Mul: + return res_lo * res_hi; + case VectorReduce::Min: + return min(res_lo, res_hi); + case VectorReduce::Max: + return max(res_lo, res_hi); + case VectorReduce::And: + return res_lo && res_hi; + case VectorReduce::Or: + return res_lo || res_hi; + default: + internal_error << "Unknown VectorReduce operator\n"; + return Expr(); + } + } + + internal_error << "Unreachable"; + return op; + } + +public: + LegalizeVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LegalizeVectors should not be called when there is no lane limit."; + } +}; + +} // namespace + +Stmt legalize_vectors_in_device_loop(const For *op) { + int max_lanes = max_lanes_for_device(op->device_api, 0); + + // Similar to CSE, lifting out stuff into variables. + // Pass 1): lift out Shuffles that exceed lane count into variables + // Pass 2): Rewrite those vector variables as bundles of vector variables, while legalizing all other stuff. 
+ Stmt m0 = simplify(op->body); + Stmt m1 = common_subexpression_elimination(m0, false); + if (!m1.same_as(op->body)) { + debug(3) << "After CSE:\n" + << m1 << "\n"; + } + Stmt m2 = LiftLetToLetStmt().mutate(m1); + if (!m2.same_as(m1)) { + debug(3) << "After lifting Lets to LetStmts:\n" + << m2 << "\n"; + } + + Stmt m3 = m2; + while (true) { + Stmt m = LiftExceedingVectors(max_lanes).mutate(m3); + bool modified = !m3.same_as(m); + m3 = std::move(m); + if (!modified) { + debug(3) << "Nothing got lifted out\n"; + break; + } else { + debug(3) << "After lifting exceeding vectors:\n" + << m3 << "\n"; + } + } + + Stmt m4 = LegalizeVectors(max_lanes).mutate(m3); + if (!m4.same_as(m3)) { + debug(3) << "After legalizing vectors:\n" + << m4 << "\n"; + } + if (m4.same_as(m2)) { + debug(3) << "Vector Legalization did nothing, returning input.\n"; + return op; + } + Stmt m5 = simplify(m4); + if (!m4.same_as(m5)) { + debug(3) << "After simplify:\n" + << m5 << "\n"; + } + return For::make(op->name, op->min, op->max, op->for_type, + op->partition_policy, op->device_api, m5); +} + +Stmt legalize_vectors(const Stmt &s) { + return mutate_with(s, [&](auto *self, const For *op) { + if (max_lanes_for_device(op->device_api, 0)) { + return legalize_vectors_in_device_loop(op); + } + return self->visit_base(op); + }); +} +} // namespace Internal +} // namespace Halide diff --git a/src/LegalizeVectors.h b/src/LegalizeVectors.h new file mode 100644 index 000000000000..14fe8d806fb1 --- /dev/null +++ b/src/LegalizeVectors.h @@ -0,0 +1,19 @@ +#ifndef HALIDE_INTERNAL_LEGALIZE_VECTORS_H +#define HALIDE_INTERNAL_LEGALIZE_VECTORS_H + +#include "Expr.h" + +/** \file + * Defines a lowering pass that legalizes vectorized expressions + * to have a maximal lane count. 
+ */ + +namespace Halide { +namespace Internal { + +Stmt legalize_vectors(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Lower.cpp b/src/Lower.cpp index 9b55bd20840d..0af72b265cf0 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -42,6 +42,7 @@ #include "InjectHostDevBufferCopies.h" #include "Inline.h" #include "LICM.h" +#include "LegalizeVectors.h" #include "LoopCarry.h" #include "LowerParallelTasks.h" #include "LowerWarpShuffles.h" @@ -439,6 +440,10 @@ void lower_impl(const vector &output_funcs, s = flatten_nested_ramps(s); log("Lowering after flattening nested ramps:", s); + debug(1) << "Legalizing vectors...\n"; + s = legalize_vectors(s); + log("Lowering after legalizing vectors:", s); + debug(1) << "Removing dead allocations and moving loop invariant code...\n"; s = remove_dead_allocations(s); s = simplify(s); diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 0eb3bbaf3c15..3d1d7b8f053b 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -69,7 +69,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { return value; } - if (info && op->type.is_int()) { + if (info && op->type.is_int_or_uint()) { switch (op->op) { case VectorReduce::Add: // Alignment of result is the alignment of the arg. 
Bounds @@ -123,7 +123,8 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { case VectorReduce::Add: { auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); if (rewrite(h_add(x * broadcast(y, arg_lanes), lanes), h_add(x, lanes) * broadcast(y, lanes)) || - rewrite(h_add(broadcast(x, arg_lanes) * y, lanes), h_add(y, lanes) * broadcast(x, lanes))) { + rewrite(h_add(broadcast(x, arg_lanes) * y, lanes), h_add(y, lanes) * broadcast(x, lanes)) || + rewrite(h_add(broadcast(x, arg_lanes), lanes), broadcast(x * factor, lanes))) { return mutate(rewrite.result, info); } break; @@ -136,7 +137,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_min(max(broadcast(x, arg_lanes), y), lanes), max(h_min(y, lanes), broadcast(x, lanes))) || rewrite(h_min(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_min(broadcast(x, c0), lanes), h_min(x, lanes), factor % c0 == 0) || - rewrite(h_min(ramp(x, y, arg_lanes), lanes), x + min(y * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_min(ramp(x, y, arg_lanes), lanes), x + min(y * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -150,7 +151,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_max(max(broadcast(x, arg_lanes), y), lanes), max(h_max(y, lanes), broadcast(x, lanes))) || rewrite(h_max(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_max(broadcast(x, c0), lanes), h_max(x, lanes), factor % c0 == 0) || - rewrite(h_max(ramp(x, y, arg_lanes), lanes), x + max(y * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_max(ramp(x, y, arg_lanes), lanes), x + max(y * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -164,14 +165,14 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_and(broadcast(x, arg_lanes) && y, lanes), h_and(y, lanes) && broadcast(x, lanes)) || rewrite(h_and(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || 
rewrite(h_and(broadcast(x, c0), lanes), h_and(x, lanes), factor % c0 == 0) || - rewrite(h_and(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), - x + max(y * (arg_lanes - 1), 0) < z) || - rewrite(h_and(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), - x + max(y * (arg_lanes - 1), 0) <= z) || - rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x < y + min(z * (arg_lanes - 1), 0)) || - rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x <= y + min(z * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_and(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), + x + max(y * (arg_lanes - 1), 0) < z)) || + (lanes == 1 && rewrite(h_and(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), + x + max(y * (arg_lanes - 1), 0) <= z)) || + (lanes == 1 && rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x < y + min(z * (arg_lanes - 1), 0))) || + (lanes == 1 && rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x <= y + min(z * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -186,14 +187,14 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_or(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_or(broadcast(x, c0), lanes), h_or(x, lanes), factor % c0 == 0) || // type of arg_lanes is somewhat indeterminate - rewrite(h_or(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), - x + min(y * (arg_lanes - 1), 0) < z) || - rewrite(h_or(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), - x + min(y * (arg_lanes - 1), 0) <= z) || - rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x < y + max(z * (arg_lanes - 1), 0)) || - rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x <= y + max(z * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_or(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), + x + min(y * (arg_lanes - 1), 0) < z)) || + (lanes == 1 && 
rewrite(h_or(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), + x + min(y * (arg_lanes - 1), 0) <= z)) || + (lanes == 1 && rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x < y + max(z * (arg_lanes - 1), 0))) || + (lanes == 1 && rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x <= y + max(z * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } diff --git a/src/Simplify_Let.cpp b/src/Simplify_Let.cpp index 1c60e7a2510d..9f18a6fb25c1 100644 --- a/src/Simplify_Let.cpp +++ b/src/Simplify_Let.cpp @@ -98,7 +98,7 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { Expr new_var = Variable::make(f.new_value.type(), f.new_name); Expr replacement = new_var; - debug(4) << "simplify let " << op->name << " = " << f.value << " in...\n"; + debug(4) << "simplify let " << op->name << " = (" << f.value.type() << ") " << f.value << " in...\n"; while (true) { const Variable *var = f.new_value.template as(); @@ -180,6 +180,21 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { f.new_value = cast->value; new_var = Variable::make(f.new_value.type(), f.new_name); replacement = substitute(f.new_name, Cast::make(cast->type, new_var), replacement); + } else if (shuffle && shuffle->is_concat() && is_pure(shuffle)) { + // Substitute in all concatenates as they will likely simplify + // with other shuffles. + // As the structure of this while loop makes it hard to peel off + // pure operations from _all_ arguments to the Shuffle, we will + // instead substitute all of the vars that go in the shuffle, and + // instead guard against side effects by checking with `is_pure()`. + // + // Also, it is safe to substitute in without combinatorial + // blow-up, because deeply nested concats implies a + // combinatorially-large number of vector lanes, which we can't + // express in the type system anyway. 
+ replacement = substitute(f.new_name, shuffle, replacement); + f.new_value = Expr(); + break; } else if (shuffle && shuffle->is_slice()) { // Replacing new_value below might free the shuffle // indices vector, so save them now. diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..644418664ffc 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -5,6 +5,7 @@ namespace Halide { namespace Internal { +using std::pair; using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { @@ -25,9 +26,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } - // Mutate the vectors vector new_vectors; + vector new_indices = op->indices; bool changed = false; + + // Mutate the vectors for (const Expr &vector : op->vectors) { ExprInfo v_info; Expr new_vector = mutate(vector, &v_info); @@ -45,57 +48,164 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { new_vectors.push_back(new_vector); } - // Try to convert a load with shuffled indices into a - // shuffle of a dense load. + // A concat of one vector, is just the vector. + // (Early check, this is repeated below, once the argument list is potentially reduced) + if (op->vectors.size() == 1 && op->is_concat()) { + return new_vectors[0]; + } + + Expr result = op; + + // Analyze which input vectors are actually used. We will rewrite + // the vector of inputs and the indices jointly, and continue with + // those below. + { + vector arg_used(new_vectors.size()); + // Figure out if all extracted lanes come from 1 component. + vector> src_vec_and_lane_idx = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->indices.size()); ++i) { + arg_used[src_vec_and_lane_idx[i].first] = true; + } + size_t num_args_used = 0; + for (bool used : arg_used) { + if (used) { + num_args_used++; + } + } + + if (num_args_used < op->vectors.size()) { + // Not all arguments to the shuffle are used by the indices. + // Let's throw them out. 
+ for (int vi = arg_used.size() - 1; vi >= 0; --vi) { + if (!arg_used[vi]) { + int lanes_deleted = op->vectors[vi].type().lanes(); + int vector_start_lane = 0; + for (int i = 0; i < vi; ++i) { + vector_start_lane += op->vectors[i].type().lanes(); + } + for (int &new_index : new_indices) { + if (new_index > vector_start_lane) { + internal_assert(new_index >= vector_start_lane + lanes_deleted); + new_index -= lanes_deleted; + } + } + new_vectors.erase(new_vectors.begin() + vi); + } + } + + changed = true; + } + } + + // Replace the op with the intermediate simplified result (if it changed), and continue. + if (changed) { + result = Shuffle::make(new_vectors, new_indices); + op = result.as(); + changed = false; + } + + if (new_vectors.size() == 1) { + const Ramp *ramp = new_vectors[0].as(); + if (ramp && ramp->base.type().is_scalar() && op->is_slice()) { + int first_lane_in_src = op->indices[0]; + int slice_stride = op->slice_stride(); + if (slice_stride >= 1) { + return mutate(Ramp::make(ramp->base + first_lane_in_src * ramp->stride, + ramp->stride * slice_stride, + op->indices.size()), + nullptr); + } + } + + // Test this again, but now after new_vectors got potentially shorter. + if (op->is_concat()) { + return new_vectors[0]; + } + } + + // Try to convert a Shuffle of Loads into a single Load of a Ramp. + // Make sure to not undo the work of the StageStridedLoads pass: + // only if the result of the shuffled indices is a *dense* ramp, we + // can proceed. There are two side cases: concatenations of scalars, + // and when the loads weren't dense to begin with. 
if (const Load *first_load = new_vectors[0].as()) { vector load_predicates; vector load_indices; + bool all_loads_are_dense = true; bool unpredicated = true; + bool concat_of_scalars = true; for (const Expr &e : new_vectors) { const Load *load = e.as(); if (load && load->name == first_load->name) { load_predicates.push_back(load->predicate); load_indices.push_back(load->index); unpredicated = unpredicated && is_const_one(load->predicate); + if (const Ramp *index_ramp = load->index.as()) { + if (!is_const_one(index_ramp->stride)) { + all_loads_are_dense = false; + } + } else if (!load->index.type().is_scalar()) { + all_loads_are_dense = false; + } + if (!load->index.type().is_scalar()) { + concat_of_scalars = false; + } } else { break; } } + debug(3) << "Shuffle of Load found: " << result << " where" + << " all_loads_are_dense=" << all_loads_are_dense << "," + << " concat_of_scalars=" << concat_of_scalars << "\n"; + if (load_indices.size() == new_vectors.size()) { + // All of the Shuffle arguments are Loads. 
Type t = load_indices[0].type().with_lanes(op->indices.size()); Expr shuffled_index = Shuffle::make(load_indices, op->indices); + debug(3) << " Shuffled index: " << shuffled_index << "\n"; ExprInfo shuffled_index_info; shuffled_index = mutate(shuffled_index, &shuffled_index_info); - if (shuffled_index.as()) { - ExprInfo base_info; - if (const Ramp *r = shuffled_index.as()) { - mutate(r->base, &base_info); - } + debug(3) << " Simplified shuffled index: " << shuffled_index << "\n"; + if (const Ramp *index_ramp = shuffled_index.as()) { + if (is_const_one(index_ramp->stride) || !all_loads_are_dense || concat_of_scalars) { + ExprInfo base_info; + mutate(index_ramp->base, &base_info); - ModulusRemainder alignment = - ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); + ModulusRemainder alignment = + ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); - Expr shuffled_predicate; - if (unpredicated) { - shuffled_predicate = const_true(t.lanes(), nullptr); - } else { - shuffled_predicate = Shuffle::make(load_predicates, op->indices); - shuffled_predicate = mutate(shuffled_predicate, nullptr); + Expr shuffled_predicate; + if (unpredicated) { + shuffled_predicate = const_true(t.lanes(), nullptr); + } else { + shuffled_predicate = Shuffle::make(load_predicates, op->indices); + shuffled_predicate = mutate(shuffled_predicate, nullptr); + } + t = first_load->type; + t = t.with_lanes(op->indices.size()); + Expr result = Load::make(t, first_load->name, shuffled_index, first_load->image, + first_load->param, shuffled_predicate, alignment); + debug(3) << " => " << result << "\n"; + return result; } - t = first_load->type; - t = t.with_lanes(op->indices.size()); - return Load::make(t, first_load->name, shuffled_index, first_load->image, - first_load->param, shuffled_predicate, alignment); + } else { + // We can't... Leave it as a Shuffle of Loads. 
+ // Note: no mutate-recursion as we are dealing here with a + // Shuffle of Loads, which have already undergone mutation + // early in this function (new_vectors). + return result; } } } // Try to collapse a shuffle of broadcasts into a single // broadcast. Note that it doesn't matter what the indices - // are. + // are. Only applies when the broadcast value is scalar, + // because Broadcast::make(vec, N) has vec.lanes() * N total + // lanes. const Broadcast *b1 = new_vectors[0].as(); - if (b1) { + if (b1 && b1->value.type().is_scalar()) { bool can_collapse = true; for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) { if (const Broadcast *b2 = new_vectors[i].as()) { @@ -289,13 +399,18 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { if (inner_shuffle->is_concat()) { int slice_min = op->indices.front(); int slice_max = op->indices.back(); + if (slice_min > slice_max) { + // Slices can go backward. + std::swap(slice_min, slice_max); + } int concat_index = 0; int new_slice_start = -1; vector new_concat_vectors; for (const auto &v : inner_shuffle->vectors) { // Check if current concat vector overlaps with slice. 
- if ((concat_index >= slice_min && concat_index <= slice_max) || - ((concat_index + v.type().lanes() - 1) >= slice_min && (concat_index + v.type().lanes() - 1) <= slice_max)) { + int overlap_max = std::min(slice_max, concat_index + v.type().lanes() - 1); + int overlap_min = std::max(slice_min, concat_index); + if (overlap_min <= overlap_max) { if (new_slice_start < 0) { new_slice_start = concat_index; } @@ -305,17 +420,16 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { concat_index += v.type().lanes(); } if (new_concat_vectors.size() < inner_shuffle->vectors.size()) { - return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), op->slice_begin() - new_slice_start, op->slice_stride(), op->indices.size()); + return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), + op->slice_begin() - new_slice_start, + op->slice_stride(), + op->indices.size()); } } } } - if (!changed) { - return op; - } else { - return Shuffle::make(new_vectors, op->indices); - } + return result; } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 2d149adbaf20..fc6fd9531983 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -732,8 +732,8 @@ class VectorSubs : public IRMutator { if (op->is_intrinsic(Call::prefetch)) { // We don't want prefetch args to ve vectorized, but we can't just skip the mutation - // (otherwise we can end up with dead loop variables. Instead, use extract_lane() on each arg - // to scalarize it again. + // (otherwise we can end up with dead loop variables). Instead, use extract_lane() on + // each arg to scalarize it again. 
for (auto &arg : new_args) { if (arg.type().is_vector()) { arg = extract_lane(arg, 0); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index e9d345b6d403..db74739e20da 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -280,6 +280,8 @@ const char *vk_get_error_name(VkResult error) { return "VK_ERROR_FORMAT_NOT_SUPPORTED"; case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: @@ -303,6 +305,8 @@ const char *vk_get_error_name(VkResult error) { } } +#define vk_report_error(user_context, code, func) (error((user_context)) << "Vulkan: " << (func) << " returned " << vk_get_error_name((code)) << " (code: " << (code) << ") ") + // -------------------------------------------------------------------------- } // namespace diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index d2ef2ee5ba6f..35c38d05dd7f 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -85,7 +85,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, debug(user_context) << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "queue_index: " << queue_index << ")\n"; + << "queue_index: " << queue_index << ")"; #endif if (allocator == nullptr) { @@ -103,7 +103,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkResult result = vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create command pool!\n"; + vk_report_error(user_context, result, "vkCreateCommandPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -117,7 +117,7 @@ int 
vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkResetCommandPool(allocator->current_device(), command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); @@ -135,7 +135,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocato << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -150,7 +150,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocato VkResult result = vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to allocate command buffers!\n"; + vk_report_error(user_context, result, "vkAllocateCommandBuffers"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -165,7 +165,7 @@ int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocat << "command_buffer: " << (void *)command_buffer << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command buffer ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -231,7 +231,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkBeginCommandBuffer"); return halide_error_code_generic_error; } @@ -242,7 +242,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkEndCommandBuffer"); return halide_error_code_generic_error; } @@ -272,7 +272,7 @@ int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer VkResult result = vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkSubmitQueue"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -325,7 +325,7 @@ int vk_create_descriptor_pool(void *user_context, << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -362,7 +362,7 @@ int vk_create_descriptor_pool(void *user_context, VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create descriptor pool! 
vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -378,7 +378,7 @@ int vk_destroy_descriptor_pool(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); @@ -402,7 +402,7 @@ int vk_create_descriptor_set_layout(void *user_context, << "layout: " << (void *)layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -460,7 +460,7 @@ int vk_create_descriptor_set_layout(void *user_context, // Create the descriptor set layout VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); if (result != VK_SUCCESS) { - error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorSetLayout"); return halide_error_code_generic_error; } @@ -478,7 +478,7 @@ int vk_destroy_descriptor_set_layout(void *user_context, << "layout: " << (void *)descriptor_set_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); @@ -500,7 +500,7 @@ int vk_create_descriptor_set(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -515,7 +515,7 @@ int vk_create_descriptor_set(void *user_context, VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkAllocateDescriptorSets"); return halide_error_code_generic_error; } @@ -541,7 +541,7 @@ int vk_update_descriptor_set(void *user_context, << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -599,7 +599,7 @@ int vk_update_descriptor_set(void *user_context, // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(owner->handle); if (device_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -698,7 +698,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!"; return nullptr; } @@ -711,7 +711,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, // allocate a new region MemoryRegion *region = allocator->reserve(user_context, request); if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!"; return nullptr; } @@ -733,19 +733,19 @@ int vk_update_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... 
invalid memory region!"; return halide_error_code_internal_error; } // map the region to a host ptr uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); if (host_ptr == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!"; return halide_error_code_internal_error; } @@ -798,7 +798,7 @@ int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator * << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -832,7 +832,7 @@ int vk_create_pipeline_layout(void *user_context, << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create pipeline layout ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -841,7 +841,7 @@ int vk_create_pipeline_layout(void *user_context, if (descriptor_set_count > max_bound_descriptor_sets) { error(user_context) << "Vulkan: Number of descriptor sets for pipeline layout exceeds the number that can be bound by device!\n" << " requested: " << descriptor_set_count << "," - << " available: " << max_bound_descriptor_sets << "\n"; + << " available: " << max_bound_descriptor_sets; return halide_error_code_incompatible_device_interface; } } @@ -858,7 +858,7 @@ int vk_create_pipeline_layout(void *user_context, VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreatePipelineLayout"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -876,7 +876,7 @@ int vk_destroy_pipeline_layout(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -898,11 +898,12 @@ int vk_create_compute_pipeline(void *user_context, debug(user_context) << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " + << "pipeline_name: " << pipeline_name << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -928,7 +929,10 @@ int vk_create_compute_pipeline(void *user_context, VkResult result = vkCreateComputePipelines(allocator->current_device(), VK_NULL_HANDLE, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateComputePipeline") + << "failed to create compute pipeline " << pipeline_name << ".\n" + << " (This might be a bug in Halide. To debug this, see the HL_SPIRV_DUMP_FILE environment variable, and use the Khronos validator to make a bug report)"; + return halide_error_code_generic_error; } @@ -955,24 +959,24 @@ int vk_setup_compute_pipeline(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!"; return halide_error_code_generic_error; } VkResult result = VK_SUCCESS; const char *entry_point_name = shader_bindings->entry_point_name; if (entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... 
missing entry point name!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!"; return halide_error_code_generic_error; } @@ -995,7 +999,7 @@ int vk_setup_compute_pipeline(void *user_context, } else { // dynamic allocation if (shared_mem_constant_id > 0) { - error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is supported!!\n"; + error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is suported!!"; result = VK_ERROR_TOO_MANY_OBJECTS; break; } @@ -1028,13 +1032,13 @@ int vk_setup_compute_pipeline(void *user_context, if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } if (dispatch_data->shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of dynamic shared memory used exceeds device limit!\n" << " requested: " << dispatch_data->shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1065,14 +1069,14 @@ int vk_setup_compute_pipeline(void *user_context, } } if (found_index == invalid_index) { - error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!\n"; + error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!"; result = VK_ERROR_INITIALIZATION_FAILED; } } // don't even attempt to create the pipeline layout if we encountered errors in the shader binding if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to decode shader bindings! 
" << vk_get_error_name(result) << "\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings! " << vk_get_error_name(result); return halide_error_code_generic_error; } @@ -1100,7 +1104,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline) { int error_code = vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline!"; return halide_error_code_generic_error; } shader_bindings->compute_pipeline = VK_NULL_HANDLE; @@ -1108,7 +1112,7 @@ int vk_setup_compute_pipeline(void *user_context, int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } @@ -1118,7 +1122,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline == VK_NULL_HANDLE) { int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } } @@ -1138,7 +1142,7 @@ int vk_destroy_compute_pipeline(void *user_context, << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline ... 
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -1160,12 +1164,12 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!"; return nullptr; } if ((module_ptr == nullptr) || (module_size < (2 * sizeof(uint32_t)))) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!"; return nullptr; } @@ -1213,7 +1217,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA uint32_t idx = 1; // skip past the header_word_count uint32_t shader_count = module_ptr[idx++]; if (shader_count < 1) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!"; return nullptr; // no descriptors } @@ -1222,7 +1226,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shader_bindings_size = shader_count * sizeof(VulkanShaderBinding); VulkanShaderBinding *shader_bindings = (VulkanShaderBinding *)vk_host_malloc(user_context, shader_bindings_size, 0, alloc_scope, allocator->callbacks()); if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shader_bindings! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shader_bindings! 
Out of memory!"; return nullptr; } memset(shader_bindings, 0, shader_bindings_size); @@ -1255,7 +1259,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t specialization_constants_size = specialization_constants_count * sizeof(VulkanSpecializationConstant); specialization_constants = (VulkanSpecializationConstant *)vk_host_malloc(user_context, specialization_constants_size, 0, alloc_scope, allocator->callbacks()); if (specialization_constants == nullptr) { - error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!"; return nullptr; } memset(specialization_constants, 0, specialization_constants_size); @@ -1291,7 +1295,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shared_memory_allocations_size = shared_memory_allocations_count * sizeof(VulkanSharedMemoryAllocation); shared_memory_allocations = (VulkanSharedMemoryAllocation *)vk_host_malloc(user_context, shared_memory_allocations_size, 0, alloc_scope, allocator->callbacks()); if (shared_memory_allocations == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!"; return nullptr; } memset(shared_memory_allocations, 0, shared_memory_allocations_size); @@ -1356,7 +1360,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif shader_bindings[n].entry_point_name = (char *)vk_host_malloc(user_context, entry_point_name_length * sizeof(uint32_t), 0, alloc_scope, allocator->callbacks()); if (shader_bindings[n].entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to allocate entry_point_name! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate entry_point_name! 
Out of memory!"; return nullptr; } @@ -1408,7 +1412,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1420,7 +1424,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (shader_count > max_descriptors) { error(user_context) << "Vulkan: Number of required descriptor sets exceeds the amount available for device!\n" << " requested: " << shader_count << "," - << " available: " << max_descriptors << "\n"; + << " available: " << max_descriptors; return halide_error_code_incompatible_device_interface; } } @@ -1516,7 +1520,7 @@ VulkanCompilationCacheEntry *vk_compile_kernel_module(void *user_context, Vulkan // Compile the "SPIR-V Module" for the kernel cache_entry->compiled_modules[i] = vk_compile_shader_module(user_context, allocator, (const char *)spirv_ptr, (int)spirv_size); if (cache_entry->compiled_modules[i] == nullptr) { - debug(user_context) << "Vulkan: Failed to compile shader module!\n"; + debug(user_context) << "Vulkan: Failed to compile shader module!"; error_code = halide_error_code_generic_error; } @@ -1556,12 +1560,12 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!"; return nullptr; } if ((ptr == nullptr) || (size <= 0)) { - error(user_context) << "Vulkan: Failed to compile shader modules ... 
invalid program source buffer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!"; return nullptr; } @@ -1599,7 +1603,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; VulkanCompiledShaderModule *compiled_module = (VulkanCompiledShaderModule *)vk_host_malloc(user_context, sizeof(VulkanCompiledShaderModule), 0, alloc_scope, allocator->callbacks()); if (compiled_module == nullptr) { - error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!"; return nullptr; } memset(compiled_module, 0, sizeof(VulkanCompiledShaderModule)); @@ -1607,7 +1611,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM // decode the entry point data and extract the shader bindings VulkanShaderBinding *decoded_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); if (decoded_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings!"; return nullptr; } @@ -1624,8 +1628,8 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM compiled_module->shader_count = shader_count; VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &compiled_module->shader_module); - if ((result != VK_SUCCESS)) { - error(user_context) << "Vulkan: vkCreateShaderModule Failed! 
Error returned: " << vk_get_error_name(result) << "\n"; + if (result != VK_SUCCESS) { + vk_report_error(user_context, result, "vkCreateShaderModule"); vk_host_free(user_context, compiled_module->shader_bindings, allocator->callbacks()); vk_host_free(user_context, compiled_module, allocator->callbacks()); return nullptr; @@ -1635,7 +1639,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM if (compiled_module->shader_count) { compiled_module->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, compiled_module->shader_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (compiled_module->descriptor_set_layouts == nullptr) { - error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!"; return nullptr; } memset(compiled_module->descriptor_set_layouts, 0, compiled_module->shader_count * sizeof(VkDescriptorSetLayout)); @@ -1808,7 +1812,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff VkBuffer *src_buffer = reinterpret_cast(c.src); VkBuffer *dst_buffer = reinterpret_cast(c.dst); if (!src_buffer || !dst_buffer) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -1846,7 +1850,7 @@ int vk_device_crop_from_offset(void *user_context, VulkanContext ctx(user_context); if (ctx.error != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to acquire context!\n"; + error(user_context) << "Vulkan: Failed to acquire context!"; return ctx.error; } @@ -1855,14 +1859,14 @@ int vk_device_crop_from_offset(void *user_context, #endif if (byte_offset < 0) { - error(user_context) << "Vulkan: Invalid offset for device crop!\n"; + 
error(user_context) << "Vulkan: Invalid offset for device crop!"; return halide_error_code_device_crop_failed; } // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(src->device); if (device_region == nullptr) { - error(user_context) << "Vulkan: Failed to crop region! Invalid device region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Invalid device region!"; return halide_error_code_device_crop_failed; } @@ -1873,7 +1877,7 @@ int vk_device_crop_from_offset(void *user_context, region_indexing.offset = byte_offset / src->type.bytes(); MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, region_indexing); if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!"; return halide_error_code_device_crop_failed; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 64f2c9ac6825..9edfd0476cbf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -4,7 +4,7 @@ include(CheckCXXCompilerFlag) # Internal tests are a special case. # HalideTestHelpers depends on this test being present. 
add_executable(_test_internal internal.cpp) -target_link_libraries(_test_internal PRIVATE Halide::Test) +target_link_libraries(_test_internal PRIVATE Halide::Test Halide::TerminateHandler) target_include_directories(_test_internal PRIVATE "${Halide_SOURCE_DIR}/src") target_precompile_headers(_test_internal PRIVATE ) if (Halide_CCACHE_BUILD) diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 6d41a9e71219..d732f1e72284 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -124,6 +124,7 @@ tests(GROUPS correctness fuse.cpp fuse_gpu_threads.cpp fused_where_inner_extent_is_zero.cpp + fuzz_extract_lanes.cpp fuzz_float_stores.cpp fuzz_schedule.cpp fuzz_simplify.cpp @@ -221,6 +222,7 @@ tests(GROUPS correctness math.cpp median3x3.cpp memoize_cloned.cpp + metal_long_vectors.cpp metal_precompiled_shaders.cpp min_extent.cpp mod.cpp diff --git a/test/correctness/fuzz_extract_lanes.cpp b/test/correctness/fuzz_extract_lanes.cpp new file mode 100644 index 000000000000..e23e60ccf0be --- /dev/null +++ b/test/correctness/fuzz_extract_lanes.cpp @@ -0,0 +1,492 @@ +#include "Halide.h" +#include +#include +#include + +// Fuzz test for deinterleave / extract_lane operations in Deinterleave.cpp. +// Constructs random vector expressions covering the IR node types that +// the Deinterleaver has visit methods for, evaluates them by JIT-compiling +// with a custom lowering pass, then checks that deinterleave() produces +// results consistent with the original expression. 
+ +namespace { + +using std::string; +using std::vector; +using namespace Halide; +using namespace Halide::Internal; + +using RandomEngine = std::mt19937_64; + +constexpr int fuzz_var_count = 3; +std::vector> fuzz_vars(fuzz_var_count); + +template +decltype(auto) random_choice(RandomEngine &rng, T &&choices) { + std::uniform_int_distribution dist(0, std::size(choices) - 1); + return choices[dist(rng)]; +} + +Type fuzz_types[] = {UInt(8), UInt(16), UInt(32), UInt(64), Int(8), Int(16), Int(32), Int(64)}; + +Type random_scalar_type(RandomEngine &rng) { + return random_choice(rng, fuzz_types); +} + +int random_factor(RandomEngine &rng, int x) { + vector factors; + factors.reserve(x); + for (int i = 1; i < x; i++) { + if (x % i == 0) { + factors.push_back(i); + } + } + return random_choice(rng, factors); +} + +Expr random_const(RandomEngine &rng, Type t) { + int val = (int)((int8_t)(rng() & 0x0f)); + if (t.is_vector()) { + return Broadcast::make(cast(t.element_of(), val), t.lanes()); + } else { + return cast(t, val); + } +} + +Expr random_leaf(RandomEngine &rng, Type t) { + if (t.is_scalar()) { + if (rng() & 1) { + // Variable + std::uniform_int_distribution dist(0, fuzz_var_count - 1); + return cast(t, fuzz_vars[dist(rng)]); + } else { + return random_const(rng, t); + } + } + // For vector types, build from Ramp or Broadcast + int lanes = t.lanes(); + if (rng() & 1) { + Expr base = random_leaf(rng, t.element_of()); + Expr stride = random_const(rng, t.element_of()); + return Ramp::make(base, stride, lanes); + } else { + Expr val = random_leaf(rng, t.element_of()); + return Broadcast::make(val, lanes); + } +} + +Expr random_vector_expr(RandomEngine &rng, Type t, int depth) { + if (depth <= 0 || t.lanes() == 1) { + return random_leaf(rng, t); + } + + // Weight the choices to cover all Deinterleaver visit methods: + // Broadcast, Ramp, Cast, Reinterpret, Call (via abs), Shuffle, + // VectorReduce, Add/Sub/Min/Max (handled by default IRMutator) + std::function ops[] = { + 
// Leaf + [&]() -> Expr { + return random_leaf(rng, t); + }, + // Add + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a + b; + }, + // Sub (only for signed types to avoid unsigned underflow coercion errors) + [&]() -> Expr { + if (t.is_uint()) { + // Fall back to Add for unsigned types + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a + b; + } + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a - b; + }, + // Min + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return min(a, b); + }, + // Max + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + internal_assert(a.type() == b.type()) << a << " " << b; + return max(a, b); + }, + // Select + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + Expr c = random_vector_expr(rng, t, depth - 1); + Expr cond = (a > b); + return select(cond, a, c); + }, + // Cast + [&]() -> Expr { + // Cast from a different type + Type other = random_scalar_type(rng).with_lanes(t.lanes()); + while (other == t) { + other = random_scalar_type(rng).with_lanes(t.lanes()); + } + Expr e = random_vector_expr(rng, other, depth - 1); + return Cast::make(t, e); + }, + // Reinterpret (different bit width, changes lane count) + [&]() -> Expr { + int total_bits = t.bits() * t.lanes(); + // Pick a different bit width that divides the total bits evenly + int bit_widths[] = {8, 16, 32, 64}; + vector valid_widths; + for (int bw : bit_widths) { + if (total_bits % bw == 0) { + valid_widths.push_back(bw); + } + } + // Should at least be able to preserve the existing bit width and change signedness. 
+ internal_assert(!valid_widths.empty()); + int other_bits = random_choice(rng, valid_widths); + int other_lanes = total_bits / other_bits; + Type other = ((rng() & 1) ? Int(other_bits) : UInt(other_bits)).with_lanes(other_lanes); + Expr e = random_vector_expr(rng, other, depth - 1); + return Reinterpret::make(t, e); + }, + // Broadcast of sub-expression + [&]() -> Expr { + int f = random_factor(rng, t.lanes()); + Expr val = random_vector_expr(rng, t.with_lanes(f), depth - 1); + return Broadcast::make(val, t.lanes() / f); + }, + // Ramp + [&]() -> Expr { + int f = random_factor(rng, t.lanes()); + Type sub_t = t.with_lanes(f); + Expr base = random_vector_expr(rng, sub_t, depth - 1); + Expr stride = random_const(rng, sub_t); + return Ramp::make(base, stride, t.lanes() / f); + }, + // Shuffle (interleave) + [&]() -> Expr { + if (t.lanes() >= 4 && t.lanes() % 2 == 0) { + int half = t.lanes() / 2; + Expr a = random_vector_expr(rng, t.with_lanes(half), depth - 1); + Expr b = random_vector_expr(rng, t.with_lanes(half), depth - 1); + return Shuffle::make_interleave({a, b}); + } + // Fall back to a simple expression + return random_vector_expr(rng, t, depth - 1); + }, + // Shuffle (concat) + [&]() -> Expr { + if (t.lanes() >= 4 && t.lanes() % 2 == 0) { + int half = t.lanes() / 2; + Expr a = random_vector_expr(rng, t.with_lanes(half), depth - 1); + Expr b = random_vector_expr(rng, t.with_lanes(half), depth - 1); + return Shuffle::make_concat({a, b}); + } + return random_vector_expr(rng, t, depth - 1); + }, + // Shuffle (slice) + [&]() -> Expr { + // Make a wider vector and slice it + if (t.lanes() <= 8) { + int wider = t.lanes() * 2; + Expr e = random_vector_expr(rng, t.with_lanes(wider), depth - 1); + // Slice: take every other element starting at 0 or 1 + int start = rng() & 1; + return Shuffle::make_slice(e, start, 2, t.lanes()); + } + return random_vector_expr(rng, t, depth - 1); + }, + // VectorReduce (only when we can make it work with lane counts) + [&]() -> Expr { + 
// Input has more lanes, output has t.lanes() lanes + // factor must divide input lanes, and input lanes = t.lanes() * factor + int factor = (rng() % 3) + 2; + int input_lanes = t.lanes() * factor; + if (input_lanes <= 32) { + VectorReduce::Operator ops[] = { + VectorReduce::Add, + VectorReduce::Min, + VectorReduce::Max, + }; + auto op = random_choice(rng, ops); + Expr val = random_vector_expr(rng, t.with_lanes(input_lanes), depth - 1); + internal_assert(val.type().lanes() == input_lanes) << val; + return VectorReduce::make(op, val, t.lanes()); + } + return random_vector_expr(rng, t, depth - 1); + }, + // Call node (using a pure intrinsic like absd) + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return cast(t, absd(a, b)); + }, + }; + + Expr e = random_choice(rng, ops)(); + internal_assert(e.type() == t) << e.type() << " " << t << " " << e; + return e; +} + +// A custom lowering pass that replaces a specific dummy store RHS with the +// desired test expression. This lets us JIT-evaluate arbitrary vector Exprs. +class InjectExpr : public IRMutator { + using IRMutator::visit; + + string func_name; + const std::vector &replacements; + int idx = 0; + + Stmt visit(const Store *op) override { + // Replace calls to our dummy function with the replacement expr + internal_assert(idx < (int)replacements.size()); + if (op->name == func_name) { + return Store::make(op->name, flatten_nested_ramps(replacements[idx++]), + op->index, op->param, op->predicate, op->alignment); + } + return IRMutator::visit(op); + } + +public: + InjectExpr(const string &func_name, const std::vector &replacements) + : func_name(func_name), replacements(replacements) { + } +}; + +// Evaluate a vector expression by JIT-compiling it. Returns the values +// as a vector of int64_t (to hold any integer type). +// The expression may reference variables a, b, c which are set to fixed values. 
+bool evaluate_vector_exprs(const std::vector &e, + Buffer &result) { + Type t = e[0].type(); + int lanes = t.lanes(); + + // Create a Func that outputs a vector of the right size + Func f("test_func"); + Var x("x"), y("y"); + + // We define f(x, y) as a dummy, then inject our expressions via a custom + // lowering pass + Expr fuzz_var_sum = 0; + for (int i = 0; i < fuzz_var_count; i++) { + fuzz_var_sum += fuzz_vars[i]; + } + f(x, y) = cast(t.element_of(), fuzz_var_sum); + f.bound(x, 0, lanes) + .bound(y, 0, (int)e.size()) + .vectorize(x) + .unroll(y); + + // The custom lowering pass replaces the dummy RHS + InjectExpr injector(f.name(), e); + + auto buf = Runtime::Buffer<>(t.element_of(), {lanes, (int)e.size()}); + + Pipeline p(f); + p.add_custom_lowering_pass(&injector, nullptr); + if (get_target_from_environment() == get_host_target()) { + p.realize(buf); + } else { + // Compile something, to be able to at least test CodeGen from the backends and LLVM. + p.compile_to_assembly("fuzz_extract_lanes.s", {fuzz_vars[0], fuzz_vars[1], fuzz_vars[2]}, "fuzz_func"); + return false; + } + + // Upcast results to int64 for easier comparison + internal_assert(result.height() == (int)e.size()); + internal_assert(result.width() == lanes); + for (int y = 0; y < (int)e.size(); y++) { + for (int x = 0; x < lanes; x++) { + if (t.is_uint()) { + switch (t.bits()) { + case 8: + result(x, y) = buf.as()(x, y); + break; + case 16: + result(x, y) = buf.as()(x, y); + break; + case 32: + result(x, y) = buf.as()(x, y); + break; + case 64: + result(x, y) = buf.as()(x, y); + break; + default: + return false; + } + } else { + switch (t.bits()) { + case 8: + result(x, y) = buf.as()(x, y); + break; + case 16: + result(x, y) = buf.as()(x, y); + break; + case 32: + result(x, y) = buf.as()(x, y); + break; + case 64: + result(x, y) = buf.as()(x, y); + break; + default: + return false; + } + } + } + } + + return true; +} + +template +T initialize_rng() { + constexpr size_t kStateWords = T::state_size 
* sizeof(typename T::result_type) / sizeof(uint32_t); + vector random(kStateWords); + std::generate(random.begin(), random.end(), std::random_device{}); + std::seed_seq seed_seq(random.begin(), random.end()); + return T{seed_seq}; +} + +bool test_one(RandomEngine &rng) { + // Pick a random vector width and type + int lanes = std::uniform_int_distribution(4, 16)(rng); + Type scalar_t = random_scalar_type(rng); + Type t = scalar_t.with_lanes(lanes); + + // Pick random deinterleave parameters + int starting_lane = std::uniform_int_distribution(0, lanes - 1)(rng); + int ending_lane = std::uniform_int_distribution(0, lanes - 1)(rng); + int new_lanes = std::abs(ending_lane - starting_lane) + 1; + int lane_stride = std::uniform_int_distribution(1, new_lanes)(rng); + // bias it towards small strides + lane_stride = std::uniform_int_distribution(1, lane_stride)(rng); + new_lanes /= lane_stride; + if (starting_lane > ending_lane) { + lane_stride = -lane_stride; + } + + // Generate a batch of random vector expressions + constexpr int batch_size = 64; + constexpr int depth = 4; + std::vector original(batch_size); + std::vector sliced(batch_size); + + for (int i = 0; i < batch_size; i++) { + original[i] = random_vector_expr(rng, t, depth); + sliced[i] = extract_lanes(original[i], starting_lane, lane_stride, new_lanes); + internal_assert(sliced[i].type() == scalar_t.with_lanes(new_lanes)) + << sliced[i].type() << " vs " << scalar_t.with_lanes(new_lanes); + } + + // Pick random variable values + for (int i = 0; i < fuzz_var_count; i++) { + fuzz_vars[i].set((int)((int8_t)(rng() & 0x0f))); + } + + // Evaluate both + Buffer orig_vals(lanes, batch_size), sliced_vals(new_lanes, batch_size); + if (!evaluate_vector_exprs(original, orig_vals) || + !evaluate_vector_exprs(sliced, sliced_vals)) { + // Can't evaluate this for whatever reason + return true; + } + + // Check that the sliced values match the corresponding lanes of the original + for (int y = 0; y < batch_size; y++) { + for (int 
x = 0; x < new_lanes; x++) { + int orig_lane = starting_lane + x * lane_stride; + if (sliced_vals(x, y) != orig_vals(orig_lane, y)) { + std::cerr << "MISMATCH! (y=" << y << ", x=" << x << ")\n" + << "Original expr: " << original[y] << "\n" + << "Original type: " << original[y].type() << "\n" + << "ExtractLanes params: starting_lane=" << starting_lane + << " lane_stride=" << lane_stride + << " new_lanes=" << new_lanes << "\n" + << "Sliced expr: " << sliced[y] << "\n" + << "Variables:"; + for (int j = 0; j < fuzz_var_count; j++) { + std::cerr << " " << fuzz_vars[j].name() << "=" << fuzz_vars[j].get() << "\n"; + } + std::cerr << "\n" + << "Original values:"; + for (int j = 0; j < lanes; j++) { + std::cerr << " " << orig_vals(j, y); + } + std::cerr << "\n" + << "Sliced values:"; + for (int j = 0; j < new_lanes; j++) { + std::cerr << " " << sliced_vals(j, y); + } + std::cerr << "\n"; + return false; + } + } + + std::cerr << "Original values:"; + for (int j = 0; j < lanes; j++) { + std::cerr << " " << orig_vals(j, y); + } + std::cerr << " Sliced values:"; + for (int j = 0; j < new_lanes; j++) { + std::cerr << " " << sliced_vals(j, y); + } + std::cerr << " Correct.\n"; + } + + return true; +} + +} // namespace + +int main(int argc, char **argv) { + Target t = get_jit_target_from_environment(); + if (t.has_feature(Target::SVE2)) { + printf("[SKIP-WITH-ISSUE-9026] LLVM generates incorrect IR for some expressions.\n"); + return 0; + } + if (t.arch != Target::X86 || t.bits != 64) { + printf("[SKIP-WITH-ISSUE-9040] Only running test on X86-64 for now. See also #9044."); + return 0; + } + auto seed_generator = initialize_rng(); + + /* Seeds known to have failed in the past: */ + std::vector seeds_to_try = { + 11290674455725750672ull, + 18322803614019275106ull, + 12847901530538798383ull, + + // Failures on ARM: + 5792148528566212763, + 6300344786331520063, + }; + + size_t num_iters = (argc > 1) ? 
1 : 64; + + for (size_t i = 0; i < num_iters; i++) { + uint64_t seed = seed_generator(); + if (i < seeds_to_try.size()) { + seed = seeds_to_try[i]; + } + if (argc > 1) { + std::istringstream{argv[1]} >> seed; + } + std::cout << "Seed: " << seed << "\n"; + RandomEngine rng{seed}; + + if (!test_one(rng)) { + std::cout << "Failed with seed " << seed << "\n"; + return 1; + } + } + + std::cout << "Success!\n"; + return 0; +} diff --git a/test/error/metal_vector_too_large.cpp b/test/correctness/metal_long_vectors.cpp similarity index 89% rename from test/error/metal_vector_too_large.cpp rename to test/correctness/metal_long_vectors.cpp index bf4c74bb75a0..74c2e981fc2d 100644 --- a/test/error/metal_vector_too_large.cpp +++ b/test/correctness/metal_long_vectors.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); f(x, y) = input(x, y) + 42; - f.vectorize(x, 16).gpu_blocks(y, DeviceAPI::Metal); + f.vectorize(x, 32).gpu_blocks(y, DeviceAPI::Metal); std::string test_object = Internal::get_test_tmp_dir() + "metal_vector_too_large.o"; Target mac_target("x86-64-osx-metal"); diff --git a/test/correctness/require.cpp b/test/correctness/require.cpp index 625383f460df..58226077d971 100644 --- a/test/correctness/require.cpp +++ b/test/correctness/require.cpp @@ -9,7 +9,7 @@ void halide_error(JITUserContext *ctx, const char *msg) { // Emitting "error.*:" to stdout or stderr will cause CMake to report the // test as a failure on Windows, regardless of error code returned, // hence the abbreviation to "err". 
- printf("Saw (Expected) Halide Err: %s\n", msg); + printf("Saw (Expected) Halide Err: %s", msg); error_occurred = true; } @@ -46,14 +46,18 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(1); p2.set(kPrime1 - 1); error_occurred = false; result = f.realize({realize_width}); if (error_occurred) { - printf("There should not have been a requirement error (vector_width = %d)\n", vector_width); + printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); } for (int i = 0; i < realize_width; ++i) { @@ -64,6 +68,8 @@ static void test(int vector_width) { exit(1); } } + printf("OK\n"); + printf("\n"); ImageParam input(Int(32), 2); Expr h = require(p1 == p2, p1); @@ -81,8 +87,12 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(16); p2.set(16); @@ -91,6 +101,8 @@ static void test(int vector_width) { if (error_occurred) { printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 5a61df34252e..8139751d56e6 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -507,20 +507,27 @@ class SimdOpCheckTest { })); } + std::vector failed_tests; + constexpr int tabstop = 32; for (auto &f : futures) { auto result = f.get(); - constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - // The thread-pool destructor will block until 
in-progress tasks - // are done, and then will discard any tasks that haven't been - // launched yet. - return false; + failed_tests.push_back(std::move(result)); } } + if (!failed_tests.empty()) { + std::cerr << "SIMD op check summary: " << failed_tests.size() << " tests failed:\n"; + for (auto &result : failed_tests) { + const int spaces = std::max(1, tabstop - (int)result.op.size()); + std::cerr << " " << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; + } + return false; + } + return true; } diff --git a/test/correctness/simplify.cpp b/test/correctness/simplify.cpp index 628de4d91504..de10bde5a1b9 100644 --- a/test/correctness/simplify.cpp +++ b/test/correctness/simplify.cpp @@ -810,6 +810,24 @@ void check_vectors() { int_vector); check(VectorReduce::make(VectorReduce::Max, Broadcast::make(int_vector, 4), 8), VectorReduce::make(VectorReduce::Max, Broadcast::make(int_vector, 4), 8)); + + { + // h_add(broadcast(x, 8), 4) should simplify to broadcast(x * 2, 4) + check(VectorReduce::make(VectorReduce::Add, broadcast(x, 8), 4), + broadcast(x * 2, 4)); + } + + { + Expr const_u8 = cast(UInt(8), 3); + check(VectorReduce::make(VectorReduce::Add, broadcast(const_u8, 9), 3), broadcast(cast(UInt(8), 9), 3)); + } + + { + // Test VectorReduce::Add on a variable of unsigned type to ensure the multiplied factor + // keeps the correct type and avoids type-mismatch assertion failures. 
+ Expr u8_x = Variable::make(UInt(8), "u8_x"); + check(VectorReduce::make(VectorReduce::Add, broadcast(u8_x, 9), 3), broadcast(u8_x * cast(UInt(8), 3), 3)); + } } void check_bounds() { diff --git a/test/correctness/specialize.cpp b/test/correctness/specialize.cpp index 1a807003f72a..8df87dd27333 100644 --- a/test/correctness/specialize.cpp +++ b/test/correctness/specialize.cpp @@ -128,6 +128,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); @@ -156,6 +161,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); @@ -243,6 +253,10 @@ int main(int argc, char **argv) { // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize({5}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -254,6 +268,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; @@ -282,6 +300,10 @@ int main(int argc, char **argv) { // Check we used scalar stores for a strided input. 
reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -293,6 +315,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index 8a82f5ca33d1..a847304cbdd1 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -22,7 +22,7 @@ class CheckForStridedLoads : public IRMutator { if (const Ramp *r = op->index.as()) { if (op->name == buf_name) { bool dense = is_const_one(r->stride); - found |= !dense; + found_strided_load |= !dense; dense_loads += dense; } } @@ -30,27 +30,27 @@ class CheckForStridedLoads : public IRMutator { } public: - bool found = false; + bool found_strided_load = false; int dense_loads = 0; std::string buf_name; void check(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(!found); + assert(!found_strided_load); assert(dense_loads == desired_dense_loads); } void check_not(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(found); + assert(found_strided_load); assert(dense_loads == desired_dense_loads); } } checker; diff --git a/test/correctness/vector_shuffle.cpp b/test/correctness/vector_shuffle.cpp index aff6fcbcddcf..f0a62ab3d8cd 100644 --- a/test/correctness/vector_shuffle.cpp +++ 
b/test/correctness/vector_shuffle.cpp @@ -1,10 +1,20 @@ #include "Halide.h" +#include +#include #include using namespace Halide; -int main(int argc, char **argv) { - Target target = get_jit_target_from_environment(); +int test_with_indices(const Target &target, const std::vector &indices0, const std::vector &indices1) { + printf("indices0:"); + for (int i : indices0) { + printf(" %d", i); + } + printf(" indices1:"); + for (int i : indices1) { + printf(" %d", i); + } + printf("\n"); Var x{"x"}, y{"y"}; Func f0{"f0"}, f1{"f1"}, g{"g"}; @@ -12,15 +22,6 @@ int main(int argc, char **argv) { f1(x, y) = x * (y + 3); Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); - std::vector indices0; - std::vector indices1; - if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) { - indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; - indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; - } else { - indices0 = {3, 1, 6, 7}; - indices1 = {1, 0, 3, 4}; - } Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); Expr result = shuffle1 * shuffle2; @@ -55,6 +56,94 @@ int main(int argc, char **argv) { return 1; } } + return 0; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + + for (int vec_size = 8; vec_size > 1; vec_size /= 2) { + printf("Testing vector size %d...\n", vec_size); + std::vector indices0, indices1; + + // Test 1: All indices: forward/backward and combined + for (int i = 0; i < vec_size; ++i) { + indices0.push_back(i); // forward + indices1.push_back(vec_size - i - 1); // backward + } + printf(" All indices forward...\n"); + if (test_with_indices(target, indices0, indices0)) { + return 1; + } + printf(" All indices backward...\n"); + if (test_with_indices(target, indices1, indices1)) { + return 1; + 
} + printf(" All indices mixed forward / backward...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 2: Shuffled indices (4 repetitions) + for (int r = 0; r < 4; ++r) { + // Shuffle with Fisher-Yates + for (int i = vec_size - 1; i >= 1; --i) { + // indices0 + int idx = std::rand() % (i + 1); + std::swap(indices0[idx], indices0[i]); + // indices1 + idx = std::rand() % (i + 1); + std::swap(indices1[idx], indices1[i]); + } + printf(" Randomly shuffled...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + + // Test 3: Interleaved + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size / 2; ++i) { + // interleave (A, B) + indices0.push_back(i); + indices0.push_back(i + vec_size / 2); + + // interleave (B, A) + indices1.push_back(i + vec_size / 2); + indices1.push_back(i); + } + printf(" Interleaved...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 4: Concat (not-really, as the input-vectors are size 4, so only if vec_size == 8, it's a concat) + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size; ++i) { + // concat (A, B) + indices0.push_back(i); + + // concat (B, A) + indices1.push_back((i + vec_size / 2) % vec_size); + } + printf(" Concat...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + if (vec_size == 4) { + indices0 = {1, 3, 2, 0}; + indices1 = {2, 3, 1, 0}; + + printf(" Specific index combination, known to have caused problems...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + } + printf("Success!\n"); return 0; } diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index fc9496af0244..df6e74a2b5f5 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -82,7 +82,6 @@ tests(GROUPS error memoize_output_invalid.cpp memoize_redefine_eviction_key.cpp metal_threads_too_large.cpp - metal_vector_too_large.cpp 
mismatch_runtime_vscale.cpp missing_args.cpp no_default_device.cpp diff --git a/test/performance/nested_vectorization_gemm.cpp b/test/performance/nested_vectorization_gemm.cpp index 660d3d7bbdf8..4d831e4ba247 100644 --- a/test/performance/nested_vectorization_gemm.cpp +++ b/test/performance/nested_vectorization_gemm.cpp @@ -300,7 +300,6 @@ int main(int argc, char **argv) { return 1; } } - printf("Success!\n"); // 8-bit sparse blur into 32-bit accumulator { @@ -396,5 +395,6 @@ int main(int argc, char **argv) { } } + printf("Success!\n"); return 0; }