diff --git a/.gitignore b/.gitignore index a08b8e8dd7f3..888235a389d8 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,9 @@ xcuserdata # NeoVim + clangd .cache +# CCLS +.ccls-cache + # Emacs tags TAGS diff --git a/Makefile b/Makefile index c668cf20fdcd..6c6439478acd 100644 --- a/Makefile +++ b/Makefile @@ -535,6 +535,7 @@ SOURCE_FILES = \ IRVisitor.cpp \ JITModule.cpp \ Lambda.cpp \ + LegalizeVectors.cpp \ Lerp.cpp \ LICM.cpp \ LLVM_Output.cpp \ @@ -737,6 +738,7 @@ HEADER_FILES = \ IRVisitor.h \ JITModule.h \ Lambda.h \ + LegalizeVectors.h \ Lerp.h \ LICM.h \ LLVM_Output.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 036b92651667..22dc6202b0f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,7 +37,8 @@ endif () set_target_properties(Halide PROPERTIES POSITION_INDEPENDENT_CODE ON) ## -# Lists of source files. Keep ALL lists sorted in alphabetical order. +# Lists of source files. Keep ALL lists sorted in case-insensitive alphabetical order. +# (neo)vim users can use ":sort i" in visual line mode. ## # The externally-visible header files that go into making Halide.h. @@ -145,6 +146,7 @@ target_sources( IRVisitor.h JITModule.h Lambda.h + LegalizeVectors.h Lerp.h LICM.h LLVM_Output.h @@ -323,6 +325,7 @@ target_sources( IRVisitor.cpp JITModule.cpp Lambda.cpp + LegalizeVectors.cpp Lerp.cpp LICM.cpp LLVM_Output.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..6051e5e9cf62 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -33,6 +33,11 @@ bool should_extract(const Expr &e, bool lift_all) { return false; } + if (const Call *c = e.as()) { + // Calls with side effects should not be moved. 
+ return c->is_pure() || c->call_type == Call::Halide; + } + if (lift_all) { return true; } diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7178e82965d8..592072a677c1 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1524,7 +1524,7 @@ void CodeGen_ARM::visit(const Store *op) { // Declare the function std::ostringstream instr; vector arg_types; - llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, false, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); + llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, true, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); if (target.bits == 32) { instr << "llvm.arm.neon.vst" << num_vecs diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 065dcebd1a64..9d33f7d4643b 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1157,7 +1157,7 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, internal_assert(result_elements > 0); llvm::Type *result_ty = get_vector_type(element_ty, result_elements); - // Try to rewrite shuffles that only access the elements of b. + // Find the range of non-dont-care indices. int min = INT_MAX; int max = -1; for (int idx : indices) { @@ -1169,6 +1169,8 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, if (min == INT_MAX) { return llvm::PoisonValue::get(result_ty); } + + // Try to rewrite shuffles that only access the elements of b. 
if (min >= a_elements) { vector shifted_indices(indices); for (int &i : shifted_indices) { @@ -1565,6 +1567,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector &indices) { Value *ret = nullptr; for (int i = 0; i < lut_elements; i += native_elements) { Value *lut_i = slice_vector(lut, i, native_elements); + internal_assert(get_vector_num_elements(lut_i->getType()) == native_elements); vector indices_i(native_elements); vector mask(native_elements); bool all_used = true; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 300dfa096a1e..bad10f263661 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5093,10 +5093,11 @@ Value *CodeGen_LLVM::shuffle_vectors(Value *a, Value *b, } // Check for type identity *after* normalizing to fixed vectors internal_assert(a->getType() == b->getType()); + int elements_a = get_vector_num_elements(a->getType()); vector llvm_indices(indices.size()); for (size_t i = 0; i < llvm_indices.size(); i++) { if (indices[i] >= 0) { - internal_assert(indices[i] < get_vector_num_elements(a->getType()) * 2); + internal_assert(indices[i] < elements_a * 2) << indices[i] << " " << elements_a * 2; llvm_indices[i] = ConstantInt::get(i32_t, indices[i]); } else { // Only let -1 be undef. 
diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 48f1468c1316..a1de1be84099 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2086,31 +2086,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { debug(3) << "\n"; if (arg_ids.size() == 1) { - // 1 argument, just do a simple assignment via a cast SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); builder.update_id(result_id); } else if (arg_ids.size() == 2) { - - // 2 arguments, use a composite insert to update even and odd indices - uint32_t even_idx = 0; - uint32_t odd_idx = 1; - SpvFactory::Indices even_indices; - SpvFactory::Indices odd_indices; - for (int i = 0; i < op_lanes; ++i) { - even_indices.push_back(even_idx); - odd_indices.push_back(odd_idx); - even_idx += 2; - odd_idx += 2; + // 2 arguments, use vector-shuffle with logical indices indexing into (vec1[0], vec1[1], ..., vec2[0], vec2[1], ...) + SpvFactory::Indices logical_indices; + for (int i = 0; i < arg_lanes; ++i) { + logical_indices.push_back(uint32_t(i)); + logical_indices.push_back(uint32_t(i + arg_lanes)); } SpvId type_id = builder.declare_type(op->type); - SpvId value_id = builder.declare_null_constant(op->type); - SpvId partial_id = builder.reserve_id(SpvResultId); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_insert(type_id, partial_id, arg_ids[0], value_id, even_indices)); - builder.append(SpvFactory::composite_insert(type_id, result_id, arg_ids[1], partial_id, odd_indices)); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, arg_ids[0], arg_ids[1], logical_indices)); builder.update_id(result_id); } else { @@ -2140,7 +2130,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else if (op->is_extract_element()) { int idx = op->indices[0]; internal_assert(idx >= 0); - internal_assert(idx <= op->vectors[0].type().lanes()); + internal_assert(idx < op->vectors[0].type().lanes()); 
if (op->vectors[0].type().is_vector()) { SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index f7a5b5f49aa8..d6d6463d614d 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -17,6 +17,33 @@ namespace Internal { using std::pair; +std::string variable_name_with_extracted_lanes( + const std::string &varname, int varlanes, + int starting_lane, int lane_stride, int new_lanes) { + + if (lane_stride * new_lanes == varlanes) { + if (starting_lane == 0 && lane_stride == 2) { + return varname + ".even_lanes"; + } else if (starting_lane == 1 && lane_stride == 2) { + return varname + ".odd_lanes"; + } + } + if (lane_stride == 1) { + return varname + ".lanes_" + std::to_string(starting_lane) + + "_to_" + std::to_string(starting_lane + new_lanes - 1); + } else { + // Just specify the slice + std::string name = varname; + name += ".slice_"; + name += std::to_string(starting_lane); + name += "_"; + name += std::to_string(lane_stride); + name += "_"; + name += std::to_string(new_lanes); + return name; + } +} + namespace { class StoreCollector : public IRMutator { @@ -176,13 +203,17 @@ Stmt collect_strided_stores(const Stmt &stmt, const std::string &name, int strid return collect.mutate(stmt); } -class Deinterleaver : public IRGraphMutator { +class ExtractLanes : public IRMutator { public: - Deinterleaver(int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) + ExtractLanes( + int starting_lane, int lane_stride, int new_lanes, + const Scope<> &sliceable_lets, + Scope> &requested_slices) : starting_lane(starting_lane), lane_stride(lane_stride), new_lanes(new_lanes), - external_lets(lets) { + requested_slices(requested_slices) { + this->sliceable_lets.set_containing_scope(&sliceable_lets); } private: @@ -190,35 +221,177 @@ class Deinterleaver : public IRGraphMutator { int lane_stride; int new_lanes; - // lets for which we have even and odd lane 
specializations - const Scope<> &external_lets; + // vector lets we're allowed to request slices of + Scope<> sliceable_lets; + + // We populate this with the slices we need from the external_lets. + Scope> &requested_slices; using IRMutator::visit; + inline bool needs_extracting(const Expr &op) { + if (op.type().is_scalar()) { + return false; + } + return !(starting_lane == 0 && lane_stride == 1 && new_lanes == op.type().lanes()); + } + + Expr extract_lanes_from_make_struct(const Call *op) { + internal_assert(op); + internal_assert(op->is_intrinsic(Call::make_struct)); + auto [args, changed] = mutate_with_changes(op->args); + if (!changed) { + return op; + } + return Call::make(op->type, Call::make_struct, args, Call::Intrinsic); + } + + Expr extract_lanes_trace(const Call *op) { + auto event = as_const_int(op->args[6]); + internal_assert(event); + if (*event == halide_trace_load || *event == halide_trace_store) { + debug(3) << "Extracting Trace Lanes: " << Expr(op) << "\n"; + const Expr &func = op->args[0]; + Expr values = extract_lanes_from_make_struct(op->args[1].as()); + Expr coords = extract_lanes_from_make_struct(op->args[2].as()); + const Expr &type_code = op->args[3]; + const Expr &type_bits = op->args[4]; + int type_lanes = *as_const_int(op->args[5]); + const Expr &event = op->args[6]; + const Expr &parent_id = op->args[7]; + const Expr &idx = op->args[8]; + int size = *as_const_int(op->args[9]); + const Expr &tag = op->args[10]; + + int num_vecs = op->args[2].as()->args.size(); + internal_assert(size == type_lanes * num_vecs) << Expr(op); + std::vector args = { + func, + values, coords, + type_code, type_bits, Expr(new_lanes), + event, parent_id, idx, Expr(new_lanes * num_vecs), + tag}; + Expr result = Call::make(Int(32), Call::trace, args, Call::Extern); + debug(4) << " => " << result << "\n"; + return result; + } + + internal_error << "Unhandled trace call in ExtractLanes: " << *event; + } + + Expr visit(const Let *op) override { + + // Visit an entire 
chain of lets in a single method to conserve stack space. + + // This logic is very similar to the same visit method in interleaver, but not + // the same. We don't mutate the let values by default, we just produce + // any requested slices of them. + + struct Frame { + const Let *op; + ScopedBinding<> binding; + Frame(const Let *op, Scope &scope) + : op(op), + binding(op->value.type().is_vector(), scope, op->name) { + } + }; + std::vector frames; + Expr result; + + do { + result = op->body; + frames.emplace_back(op, sliceable_lets); + } while ((op = result.template as())); + + result = mutate(result); + + std::set vars_used; + auto track_vars_used = [&](const Expr &e) { + return visit_with(e, + [&](auto *self, const Variable *var) { + vars_used.insert(var->name); + }); + }; + track_vars_used(result); + + for (const auto &frame : reverse_view(frames)) { + + // The original variable, if it's needed. + if (vars_used.count(frame.op->name)) { + result = Let::make(frame.op->name, frame.op->value, result); + track_vars_used(frame.op->value); + } + + // For vector lets, we may additionally need lets for the requested + // slices of this variable: + if (frame.op->value.type().is_vector()) { + if (std::vector *reqs = requested_slices.shallow_find(frame.op->name)) { + for (const VectorSlice &sl : *reqs) { + Expr slice; + { + ScopedValue old_start(starting_lane, sl.start); + ScopedValue old_stride(lane_stride, sl.stride); + ScopedValue old_count(new_lanes, sl.count); + slice = mutate(frame.op->value); + } + track_vars_used(slice); + result = Let::make(sl.variable_name, slice, result); + } + requested_slices.pop(frame.op->name); + } + } + } + + return result; + } + Expr visit(const VectorReduce *op) override { - std::vector input_lanes; + if (!needs_extracting(op)) { + return op; + } int factor = op->value.type().lanes() / op->type.lanes(); - for (int i = starting_lane; i < op->type.lanes(); i += lane_stride) { - for (int j = 0; j < factor; j++) { - input_lanes.push_back(i * factor + 
j); + if (lane_stride != 1) { + std::vector input_lanes; + for (int i = 0; i < new_lanes; ++i) { + int lane_start = (starting_lane + lane_stride * i) * factor; + for (int j = 0; j < factor; j++) { + input_lanes.push_back(lane_start + j); + } + } + Expr in = Shuffle::make({op->value}, input_lanes); + return VectorReduce::make(op->op, in, new_lanes); + } else { + Expr in; + { + ScopedValue old_starting_lane(starting_lane, starting_lane * factor); + ScopedValue old_new_lanes(new_lanes, new_lanes * factor); + in = mutate(op->value); } + if (new_lanes == op->type.lanes() && in.same_as(op->value)) { + return op; + } + return VectorReduce::make(op->op, in, new_lanes); } - Expr in = Shuffle::make({op->value}, input_lanes); - return VectorReduce::make(op->op, in, new_lanes); } Expr visit(const Broadcast *op) override { + if (const Call *call = op->value.as()) { + if (call->name == Call::trace) { + Expr value = extract_lanes_trace(call); + if (new_lanes == 1) { + return value; + } else { + return Broadcast::make(value, new_lanes); + } + } + } if (new_lanes == 1) { if (op->value.type().lanes() == 1) { return op->value; } else { - int old_starting_lane = starting_lane; - int old_lane_stride = lane_stride; - starting_lane = starting_lane % op->value.type().lanes(); - lane_stride = op->value.type().lanes(); + ScopedValue old_starting_lane(starting_lane, starting_lane % op->value.type().lanes()); + ScopedValue old_lane_stride(lane_stride, op->value.type().lanes()); Expr e = mutate(op->value); - starting_lane = old_starting_lane; - lane_stride = old_lane_stride; return e; } } @@ -227,57 +400,70 @@ class Deinterleaver : public IRGraphMutator { return mutate(flatten_nested_ramps(op)); } + if (new_lanes == op->type.lanes()) { + return op; + } return Broadcast::make(op->value, new_lanes); } Expr visit(const Load *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; - } else { - Type t = op->type.with_lanes(new_lanes); - ModulusRemainder align = 
op->alignment; - // The alignment of a Load refers to the alignment of the first - // lane, so we can preserve the existing alignment metadata if the - // deinterleave is asking for any subset of lanes that includes the - // first. Otherwise we just drop it. We could check if the index is - // a ramp with constant stride or some other special case, but if - // that's the case, the simplifier is very good at figuring out the - // alignment, and it has access to context (e.g. the alignment of - // enclosing lets) that we do not have here. - if (starting_lane != 0) { - align = ModulusRemainder(); - } - return Load::make(t, op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), align); } + Type t = op->type.with_lanes(new_lanes); + ModulusRemainder align = op->alignment; + // The alignment of a Load refers to the alignment of the first + // lane, so we can preserve the existing alignment metadata if the + // deinterleave is asking for any subset of lanes that includes the + // first. Otherwise we just drop it. We could check if the index is + // a ramp with constant stride or some other special case, but if + // that's the case, the simplifier is very good at figuring out the + // alignment, and it has access to context (e.g. the alignment of + // enclosing lets) that we do not have here. 
+ if (starting_lane != 0) { + align = ModulusRemainder(); + } + return Load::make(t, op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), align); } Expr visit(const Ramp *op) override { + if (!needs_extracting(op)) { + return op; + } int base_lanes = op->base.type().lanes(); if (base_lanes > 1) { if (new_lanes == 1) { int index = starting_lane / base_lanes; - Expr expr = op->base + cast(op->base.type(), index) * op->stride; + Expr expr = simplify(op->base + cast(op->base.type(), index) * op->stride); ScopedValue old_starting_lane(starting_lane, starting_lane % base_lanes); ScopedValue old_lane_stride(lane_stride, base_lanes); expr = mutate(expr); return expr; } else if (base_lanes == lane_stride && starting_lane < base_lanes) { - // Base class mutator actually works fine in this - // case, but we only want one lane from the base and - // one lane from the stride. - ScopedValue old_new_lanes(new_lanes, 1); - return IRMutator::visit(op); + // We want one lane from the base and one lane from + // the stride, then build a new ramp with the right + // number of steps. + int ramp_lanes = new_lanes; + { + ScopedValue old_new_lanes(new_lanes, 1); + Expr new_base = mutate(op->base); + Expr new_stride = mutate(op->stride); + if (ramp_lanes == 1) { + return new_base; + } + return Ramp::make(new_base, new_stride, ramp_lanes); + } } else { // There is probably a more efficient way to this by // generalizing the two cases above. 
return mutate(flatten_nested_ramps(op)); } } - Expr expr = op->base + cast(op->base.type(), starting_lane) * op->stride; + Expr expr = simplify(op->base + cast(op->base.type(), starting_lane) * op->stride); internal_assert(expr.type() == op->base.type()); if (new_lanes > 1) { - expr = Ramp::make(expr, op->stride * lane_stride, new_lanes); + expr = Ramp::make(expr, simplify(op->stride * cast(op->base.type(), lane_stride)), new_lanes); } return expr; } @@ -294,39 +480,49 @@ class Deinterleaver : public IRGraphMutator { } Expr visit(const Variable *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; - } else { + } - Type t = op->type.with_lanes(new_lanes); - if (external_lets.contains(op->name) && - starting_lane == 0 && - lane_stride == 2) { - return Variable::make(t, op->name + ".even_lanes", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 1 && - lane_stride == 2) { - return Variable::make(t, op->name + ".odd_lanes", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 0 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_0_of_3", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 1 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_1_of_3", op->image, op->param, op->reduction_domain); - } else if (external_lets.contains(op->name) && - starting_lane == 2 && - lane_stride == 3) { - return Variable::make(t, op->name + ".lanes_2_of_3", op->image, op->param, op->reduction_domain); + Type t = op->type.with_lanes(new_lanes); + + if (sliceable_lets.contains(op->name)) { + // The variable accessed is marked as sliceable by the caller. + // Let's request a slice and pretend it exists. 
+ std::string sliced_var_name = variable_name_with_extracted_lanes( + op->name, op->type.lanes(), + starting_lane, lane_stride, new_lanes); + + VectorSlice new_sl; // When C++20 lands: Designated initializer + new_sl.start = starting_lane; + new_sl.stride = lane_stride; + new_sl.count = new_lanes; + new_sl.variable_name = sliced_var_name; + + if (auto *vec = requested_slices.shallow_find(op->name)) { + bool found = false; + for (const VectorSlice &existing_sl : *vec) { + if (existing_sl.start == starting_lane && + existing_sl.stride == lane_stride && + existing_sl.count == new_lanes) { + found = true; + break; + } + } + if (!found) { + vec->push_back(std::move(new_sl)); + } } else { - return give_up_and_shuffle(op); + requested_slices.push(op->name, {std::move(new_sl)}); } + return Variable::make(t, sliced_var_name, op->image, op->param, op->reduction_domain); + } else { + return give_up_and_shuffle(op); } } Expr visit(const Cast *op) override { - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; } else { Type t = op->type.with_lanes(new_lanes); @@ -335,122 +531,315 @@ class Deinterleaver : public IRGraphMutator { } Expr visit(const Reinterpret *op) override { - if (op->type.is_scalar()) { + // Written with assistance from Gemini 3 Pro, which required a lot of baby-sitting. + + // Simple case of a scalar reinterpret: always one lane: + if (!needs_extracting(op)) { return op; - } else if (op->type.bits() != op->value.type().bits()) { - return give_up_and_shuffle(op); - } else { - Type t = op->type.with_lanes(new_lanes); - return Reinterpret::make(t, mutate(op->value)); } + + int out_bits = op->type.bits(); + int in_bits = op->value.type().bits(); + + internal_assert(out_bits % in_bits == 0 || in_bits % out_bits == 0); + + // Case A: Stride 1. Calculate everything with bit-offsets + if (lane_stride == 1) { + + // Compute range of bits required from the input. 
+ int start_bit = starting_lane * out_bits; + int total_bits = new_lanes * out_bits; + int end_bit = start_bit + total_bits; + + // Convert this to a range of lane indices + int start_input_lane = start_bit / in_bits; + int end_input_lane = (end_bit + in_bits - 1) / in_bits; + int num_input_lanes = end_input_lane - start_input_lane; + + // Actually now get those lanes from the input. + Expr extracted_input_lanes; + { + ScopedValue old_sl(starting_lane, start_input_lane); + ScopedValue old_nl(new_lanes, num_input_lanes); + extracted_input_lanes = mutate(op->value); + } + + // The range of lanes we extracted from the input still might be too big, because + // we had to grab whole elements from the input, which can be coarser if out_bits > in_bits. + // So calculate how many lanes we extracted, when measured in the reinterpreted output type. + int intm_lanes = (num_input_lanes * in_bits) / out_bits; + Expr reinterpreted = Reinterpret::make(op->type.with_lanes(intm_lanes), extracted_input_lanes); + + // Now calculate how many output Type lanes we need to trim away. + int bits_to_strip_front = start_bit - (start_input_lane * in_bits); + int lanes_to_strip_front = bits_to_strip_front / out_bits; + + if (lanes_to_strip_front == 0 && intm_lanes == new_lanes) { + return reinterpreted; + } else { + return Shuffle::make_slice(reinterpreted, lanes_to_strip_front, 1, new_lanes); + } + } + + // Case B: Stride != 1. We are effectively gathering. + // We will rewrite those Reinterprets as a Concat of Reinterprets of extracted lanes. 
+ std::vector chunks(new_lanes); + for (int i = 0; i < new_lanes; ++i) { + // Find the bit range of this element in the output + int start_bit = (starting_lane + lane_stride * i) * out_bits; + int end_bit = start_bit + out_bits; + + // Map it to input lanes + int start_input_lane = start_bit / in_bits; + int end_input_lane = (end_bit + in_bits - 1) / in_bits; + int num_input_lanes = end_input_lane - start_input_lane; + + // Grab this range of lanes from the input. + Expr input_chunk; + { + ScopedValue old_start(starting_lane, start_input_lane); + ScopedValue old_stride(lane_stride, 1); + ScopedValue old_count(new_lanes, num_input_lanes); + input_chunk = mutate(op->value); + } + + // Reinterpret the chunk. + int extracted_bits = num_input_lanes * in_bits; + int reinterpreted_lanes = extracted_bits / out_bits; + internal_assert(reinterpreted_lanes != 0); + + Expr reinterpreted = Reinterpret::make(op->type.with_lanes(reinterpreted_lanes), input_chunk); + + // Now, in case of demotion: + // Example: + // R = ExtractLanes(Reinterpret([u32, u32, u32, u32], u8), 0, 2, 4) + // = ExtractLanes([u8_0, u8_1, u8_2, u8_3, ...], 0, 2, 4) + // = [u8_0, u8_2, u8_4, u8_6] + // A single extracted u32 element is too large, even after reinterpreting. + // So we need to slice the reinterpreted result. + int bit_offset = start_bit - (start_input_lane * in_bits); + int lane_offset = bit_offset / out_bits; + + if (lane_offset == 0 && reinterpreted_lanes == 1) { + chunks[i] = std::move(reinterpreted); + } else { + chunks[i] = Shuffle::make_extract_element(reinterpreted, lane_offset); + } + } + + // In case of demotion, we will potentially extract and reinterpret the same input lane several times. + // Simplification afterwards will turn them into Lets. 
+ + return Shuffle::make_concat(chunks); } Expr visit(const Call *op) override { - Type t = op->type.with_lanes(new_lanes); + internal_assert(op->type.lanes() >= starting_lane + lane_stride * (new_lanes - 1)) << Expr(op) << starting_lane << " " << lane_stride << " " << new_lanes; // Don't mutate scalars - if (op->type.is_scalar()) { + if (!needs_extracting(op)) { return op; } else { - // Vector calls are always parallel across the lanes, so we // can just deinterleave the args. + Type t = op->type.with_lanes(new_lanes); - // Beware of intrinsics for which this is not true! - auto args = mutate(op->args); - return Call::make(t, op->name, args, op->call_type, - op->func, op->value_index, op->image, op->param); + auto [args, changed] = mutate_with_changes(op->args); + if (!changed) { + // It's possible that this is a slice where output lanes = input + // lanes (e.g. reversing a vector) and the args are invariant + // under that slice (e.g. they are broadcasts). + internal_assert(t == op->type); + return op; + } else { + return Call::make(t, op->name, args, op->call_type, + op->func, op->value_index, op->image, op->param); + } } } Expr visit(const Shuffle *op) override { + if (!needs_extracting(op)) { + return op; + } + + // Special case 1: Scalar extraction + if (new_lanes == 1) { + // Find in which vector it sits. + int index = op->indices[starting_lane]; + for (const auto &vec : op->vectors) { + if (index < vec.type().lanes()) { + // We found the source vector. Extract the scalar from it. + ScopedValue old_start(starting_lane, index); + ScopedValue old_stride(lane_stride, 1); // Stride doesn't matter for scalar + ScopedValue old_count(new_lanes, 1); + return mutate(vec); + } + index -= vec.type().lanes(); + } + internal_error << "extract_lane index out of bounds: " << Expr(op) << " " << index << "\n"; + } + if (op->is_interleave()) { // Special case where we can discard some of the vector arguments entirely. 
- internal_assert(starting_lane >= 0 && starting_lane < lane_stride); - if ((int)op->vectors.size() == lane_stride) { - return op->vectors[starting_lane]; - } else if ((int)op->vectors.size() % lane_stride == 0) { - // Pick up every lane-stride vector. - std::vector new_vectors(op->vectors.size() / lane_stride); - for (size_t i = 0; i < new_vectors.size(); i++) { - new_vectors[i] = op->vectors[i * lane_stride + starting_lane]; + internal_assert(starting_lane >= 0); + int n_vectors = (int)op->vectors.size(); + + // Case A: Stride is a multiple of the number of input vectors. + // Example: extract_lanes(interleave(A, B), stride=4) + // result comes from either A or B, depending on starting lane modulo number of vectors, + // required stride of said vector is lane_stride / num_vectors + if (lane_stride > 0 && lane_stride % n_vectors == 0) { + const Expr &vec = op->vectors[starting_lane % n_vectors]; + if (vec.type().lanes() == new_lanes) { + // We need all lanes of this vector, just return it. + return vec; + } else { + // We don't need all lanes, unfortunately. Let's extract the part we need. + ScopedValue old_starting_lane(starting_lane, starting_lane / n_vectors); + ScopedValue old_lane_stride(lane_stride, lane_stride / n_vectors); + return mutate(vec); + } + } + + // Case B: Number of vectors is a multiple of the stride. + // Eg: extract_lanes(interleave(a, b, c, d, e, f), start=8, stride=3) + // = extract_lanes(a0, b0, c0, d0, e0, f0, a1, b1, c1, d1, e1, f1, ...) + // = (a2, c2, e2, c1, ...) + // = interleave(a, c) + if (lane_stride > 0 && n_vectors % lane_stride == 0) { + int num_required_vectors = n_vectors / lane_stride; + + // The result is only an interleave if the number of constituent + // vectors divides the number of total required lanes. + if (new_lanes % num_required_vectors == 0) { + int lanes_per_vec = new_lanes / num_required_vectors; + + // Pick up every lane-stride vector. 
+ std::vector new_vectors(num_required_vectors); + for (size_t i = 0; i < new_vectors.size(); i++) { + int absolute_lane_index = starting_lane + i * lane_stride; + int src_vec_idx = absolute_lane_index % n_vectors; + int vec_lane_start = absolute_lane_index / n_vectors; + const Expr &vec = op->vectors[src_vec_idx]; + + ScopedValue old_starting_lane(starting_lane, vec_lane_start); + ScopedValue old_lane_stride(lane_stride, 1); + ScopedValue old_new_lanes(new_lanes, lanes_per_vec); + new_vectors[i] = mutate(vec); + } + return Shuffle::make_interleave(new_vectors); } - return Shuffle::make_interleave(new_vectors); } } - // Keep the same set of vectors and extract every nth numeric - // arg to the shuffle. - std::vector indices; + // General case fallback + std::vector indices(new_lanes); + bool constant_stride = true; for (int i = 0; i < new_lanes; i++) { - int idx = i * lane_stride + starting_lane; - indices.push_back(op->indices[idx]); - } - - // If this is extracting a single lane, try to recursively deinterleave rather - // than leaving behind a shuffle. - if (indices.size() == 1) { - int index = indices.front(); - for (const auto &i : op->vectors) { - if (index < i.type().lanes()) { - ScopedValue lane(starting_lane, index); - return mutate(i); + int idx = op->indices[i * lane_stride + starting_lane]; + indices[i] = idx; + if (i > 1 && constant_stride) { + int stride = indices[1] - indices[0]; + if (indices[i] != indices[i - 1] + stride) { + constant_stride = false; } - index -= i.type().lanes(); } - internal_error << "extract_lane index out of bounds: " << Expr(op) << " " << index << "\n"; + } + + // One optimization if we take a slice of a single vector. 
+ if (constant_stride) { + int stride = indices[1] - indices[0]; + int first_idx = indices.front(); + int last_idx = indices.back(); + + // Find which vector contains this range + int current_bound = 0; + for (const auto &vec : op->vectors) { + int vec_lanes = vec.type().lanes(); + + // Check if the START of the ramp is in this vector + if (first_idx >= current_bound && first_idx < current_bound + vec_lanes) { + + // We found the vector containing the start. + // Now, because it is a linear ramp, we only need to check if the + // END of the ramp is also within this same vector. + // (This handles negative strides, forward strides, and broadcasts correctly). + if (last_idx >= current_bound && last_idx < current_bound + vec_lanes) { + + // Calculate the start index relative to this specific vector + int local_start = first_idx - current_bound; + + ScopedValue s_start(starting_lane, local_start); + ScopedValue s_stride(lane_stride, stride); + // new_lanes is already correct + return mutate(vec); + } + + // If the start is here but the end is elsewhere, the ramp crosses + // vector boundaries. We cannot optimize this as a single vector extraction. 
+ break; + } + current_bound += vec_lanes; + } } return Shuffle::make(op->vectors, indices); } }; -Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) { - e = substitute_in_all_lets(e); - Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); - e = d.mutate(e); - e = common_subexpression_elimination(e); - return simplify(e); -} - -Expr extract_odd_lanes(const Expr &e, const Scope<> &lets) { - internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 1, 2, e.type().lanes() / 2, lets); -} +} // namespace -Expr extract_even_lanes(const Expr &e, const Scope<> &lets) { - internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 0, 2, (e.type().lanes() + 1) / 2, lets); +Expr extract_lanes(const Expr &original_expr, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets, Scope> &requested_sliced_lets) { + internal_assert(starting_lane + (new_lanes - 1) * lane_stride <= original_expr.type().lanes()) + << "Extract lanes with start:" << starting_lane << ", stride:" << lane_stride << ", new_lanes:" << new_lanes << " " + << "out of " << original_expr.type() << " which goes out of bounds."; + + debug(3) << "ExtractLanes " + << "(start:" << starting_lane << ", stride:" << lane_stride << ", new_lanes:" << new_lanes << "): " + << original_expr << " of Type: " << original_expr.type() << "\n"; + Type original_type = original_expr.type(); + ExtractLanes d(starting_lane, lane_stride, new_lanes, lets, requested_sliced_lets); + Expr e = d.mutate(original_expr); + e = common_subexpression_elimination(e); + debug(3) << " => " << e << "\n"; + Type final_type = e.type(); + internal_assert(original_type.code() == final_type.code()) + << "Underlying types not identical after extract_lanes:\n" + << "Before: " << original_expr << "\n" + << "After: " << e << "\n"; + internal_assert(new_lanes == final_type.lanes()) + << "Number of lanes incorrect after extract_lanes: " << final_type.lanes() << " while 
expected was " << new_lanes << ": extract_lanes(" << starting_lane << ", " << lane_stride << ", " << new_lanes << "):\n" + << "Input: " << original_expr << "\nResult: " << e; + return e; } -Expr extract_mod3_lanes(const Expr &e, int lane, const Scope<> &lets) { - internal_assert(e.type().lanes() % 3 == 0); - return deinterleave(e, lane, 3, (e.type().lanes() + 2) / 3, lets); +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes) { + Scope<> lets; + Scope> req; + return extract_lanes(e, starting_lane, lane_stride, new_lanes, lets, req); } -} // namespace - Expr extract_even_lanes(const Expr &e) { internal_assert(e.type().lanes() % 2 == 0); - Scope<> lets; - return extract_even_lanes(e, lets); + return extract_lanes(e, 0, 2, e.type().lanes() / 2); } Expr extract_odd_lanes(const Expr &e) { internal_assert(e.type().lanes() % 2 == 0); - Scope<> lets; - return extract_odd_lanes(e, lets); + return extract_lanes(e, 1, 2, e.type().lanes() / 2); } Expr extract_lane(const Expr &e, int lane) { - Scope<> lets; - return deinterleave(e, lane, e.type().lanes(), 1, lets); + return extract_lanes(e, lane, e.type().lanes(), 1); } namespace { +// Change name to DensifyStridedLoadsAndStores? 
class Interleaver : public IRMutator { Scope<> vector_lets; + Scope> requested_sliced_lets; using IRMutator::visit; @@ -459,9 +848,9 @@ class Interleaver : public IRMutator { Expr deinterleave_expr(const Expr &e) { std::vector exprs; + exprs.reserve(num_lanes); for (int i = 0; i < num_lanes; i++) { - Scope<> lets; - exprs.emplace_back(deinterleave(e, i, num_lanes, e.type().lanes() / num_lanes, lets)); + exprs.emplace_back(extract_lanes(e, i, num_lanes, e.type().lanes() / num_lanes, vector_lets, requested_sliced_lets)); } return Shuffle::make_interleave(exprs); } @@ -492,18 +881,21 @@ class Interleaver : public IRMutator { for (const auto &frame : reverse_view(frames)) { Expr value = std::move(frame.new_value); + // The original variable: result = LetOrLetStmt::make(frame.op->name, value, result); - // For vector lets, we may additionally need a let defining the even and odd lanes only + // For vector lets, we may additionally need a lets for the requested slices of this variable: if (value.type().is_vector()) { - if (value.type().lanes() % 2 == 0) { - result = LetOrLetStmt::make(frame.op->name + ".even_lanes", extract_even_lanes(value, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".odd_lanes", extract_odd_lanes(value, vector_lets), result); - } - if (value.type().lanes() % 3 == 0) { - result = LetOrLetStmt::make(frame.op->name + ".lanes_0_of_3", extract_mod3_lanes(value, 0, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".lanes_1_of_3", extract_mod3_lanes(value, 1, vector_lets), result); - result = LetOrLetStmt::make(frame.op->name + ".lanes_2_of_3", extract_mod3_lanes(value, 2, vector_lets), result); + if (std::vector *reqs = + requested_sliced_lets.shallow_find(frame.op->name)) { + for (const VectorSlice &sl : *reqs) { + result = LetOrLetStmt::make( + sl.variable_name, + extract_lanes(value, sl.start, sl.stride, sl.count, + vector_lets, requested_sliced_lets), + result); + } + 
requested_sliced_lets.pop(frame.op->name); } } } @@ -718,7 +1110,7 @@ class Interleaver : public IRMutator { const Ramp *ri = stores[i].as<Store>()->index.as<Ramp>(); internal_assert(ri); - // Mismatched store vector laness. + // Mismatched store vector lanes. if (ri->lanes != lanes) { return Stmt(); } diff --git a/src/Deinterleave.h b/src/Deinterleave.h index 485641f71a5f..630fa8e7ecc1 100644 --- a/src/Deinterleave.h +++ b/src/Deinterleave.h @@ -9,15 +9,21 @@ */ #include "Expr.h" +#include "Scope.h" namespace Halide { namespace Internal { -/** Extract the odd-numbered lanes in a vector */ -Expr extract_odd_lanes(const Expr &a); +struct VectorSlice { + int start, stride, count; + std::string variable_name; +}; -/** Extract the even-numbered lanes in a vector */ -Expr extract_even_lanes(const Expr &a); +/** Extract lanes, relying on the fact that the caller will provide new variables in Lets or LetStmts which correspond to slices of the original variable. */ +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &sliceable_lets, Scope<std::vector<VectorSlice>> &requested_sliced_lets); + +/** Extract lanes without requesting any extra slices from variables. 
*/ +Expr extract_lanes(const Expr &e, int starting_lane, int lane_stride, int new_lanes); /** Extract the nth lane of a vector */ Expr extract_lane(const Expr &vec, int lane); diff --git a/src/IR.cpp b/src/IR.cpp index 2c91d16b50e1..0cc6cb828500 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -12,7 +12,7 @@ namespace Internal { Expr Cast::make(Type t, Expr v) { internal_assert(v.defined()) << "Cast of undefined\n"; - internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths\n"; + internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths: " << v << " of type " << v.type() << " cannot be cast to " << t << "\n"; Cast *node = new Cast; node->type = t; @@ -281,7 +281,7 @@ Expr Ramp::make(Expr base, Expr stride, int lanes) { Expr Broadcast::make(Expr value, int lanes) { internal_assert(value.defined()) << "Broadcast of undefined\n"; - internal_assert(lanes != 1) << "Broadcast of lanes 1\n"; + internal_assert(lanes != 1) << "Broadcast over 1 lane is not a broadcast\n"; Broadcast *node = new Broadcast; node->type = value.type().with_lanes(lanes * value.type().lanes()); diff --git a/src/IROperator.h b/src/IROperator.h index 797f12870f5d..015b767bd330 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -1285,7 +1285,8 @@ Expr random_int(Expr seed = Expr()); /** Create an Expr that prints out its value whenever it is * evaluated. It also prints out everything else in the arguments - * list, separated by spaces. This can include string literals. */ + * list, separated by spaces. This can include string literals. + * Evaluates to the first argument passed. 
*/ //@{ Expr print(const std::vector &values); diff --git a/src/LegalizeVectors.cpp b/src/LegalizeVectors.cpp new file mode 100644 index 000000000000..85fba6c3eca1 --- /dev/null +++ b/src/LegalizeVectors.cpp @@ -0,0 +1,541 @@ +#include "LegalizeVectors.h" +#include "CSE.h" +#include "Deinterleave.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Util.h" + +#include +#include + +namespace Halide { +namespace Internal { + +namespace { + +using namespace std; + +int max_lanes_for_device(DeviceAPI api, int parent_max_lanes) { + // The environment variable below (HL_FORCE_VECTOR_LEGALIZATION) is here solely for testing purposes. + // It is useful to "stress-test" this lowering pass by forcing a shorter maximal vector size across + // all codegen across the entire test suite. This should not be used in real uses of Halide. + std::string envvar = Halide::Internal::get_env_variable("HL_FORCE_VECTOR_LEGALIZATION"); + if (!envvar.empty()) { + return std::atoi(envvar.c_str()); + } + // The remainder of this function correctly determines the number of lanes the device API supports. + switch (api) { + case DeviceAPI::Metal: + case DeviceAPI::WebGPU: + case DeviceAPI::Vulkan: + case DeviceAPI::D3D12Compute: + return 4; + case DeviceAPI::OpenCL: + return 16; + case DeviceAPI::CUDA: + case DeviceAPI::Hexagon: + case DeviceAPI::HexagonDma: + case DeviceAPI::Host: + return 0; // No max: LLVM based legalization + case DeviceAPI::None: + return parent_max_lanes; + case DeviceAPI::Default_GPU: + internal_error << "No GPU API was selected."; + return 0; + } + internal_error << "Unknown Device API"; + return 0; +} + +class LiftLetToLetStmt : public IRMutator { + using IRMutator::visit; + + unordered_set lifted_let_names; + vector lets; + Expr visit(const Let *op) override { + internal_assert(lifted_let_names.count(op->name) == 0) + << "Let " << op->name << " = ... 
cannot be lifted to LetStmt because the name is not unique."; + lets.push_back(op); + lifted_let_names.insert(op->name); + return mutate(op->body); + } + +public: + Stmt mutate(const Stmt &s) override { + ScopedValue scoped_lets(lets, {}); + Stmt mutated = IRMutator::mutate(s); + for (const Let *let : reverse_view(lets)) { + mutated = LetStmt::make(let->name, let->value, mutated); + } + return mutated; + } + + Expr mutate(const Expr &e) override { + return IRMutator::mutate(e); + } +}; + +class LiftExceedingVectors : public IRMutator { + using IRMutator::visit; + + int max_lanes; + + vector> lets; + bool just_in_let_definition{false}; + + template + auto visit_let_or_letstmt(const LetOrLetStmt *op) -> decltype(op->body) { + just_in_let_definition = true; + Expr def = mutate(op->value); + just_in_let_definition = false; + + decltype(op->body) body = mutate(op->body); + if (def.same_as(op->value) && body.same_as(op->body)) { + return op; + } + return LetOrLetStmt::make(op->name, std::move(def), std::move(body)); + } + + Expr visit(const Let *op) override { + return visit_let_or_letstmt(op); + } + + Stmt visit(const LetStmt *op) override { + return visit_let_or_letstmt(op); + } + + Expr visit(const Call *op) override { + // Custom handling of Call, to prevent certain things from being extracted out + // of the call arguments, as that's not always allowed. + bool exceeds_lanecount = op->type.lanes() > max_lanes; + Expr mutated = op; + if (exceeds_lanecount) { + std::vector args; + args.reserve(op->args.size()); + bool changed = false; + for (int i = 0; i < int(op->args.size()); ++i) { + bool may_extract = true; + if (op->is_intrinsic(Call::require)) { + // Call::require is special: it behaves a little like if-then-else: + // it runs the 3rd argument (the error handling part) only when there + // is an error. Extracting that would unconditionally print the error. 
+ may_extract &= i < 2; + } + if (op->is_intrinsic(Call::if_then_else)) { + // Only allow the condition to be extracted. + may_extract &= i == 0; + } + const Expr &arg = op->args[i]; + if (may_extract) { + internal_assert(arg.type().lanes() == op->type.lanes()); + Expr mutated = mutate(arg); + if (!mutated.same_as(arg)) { + changed = true; + } + args.push_back(mutated); + } else { + args.push_back(arg); + } + } + if (!changed) { + return op; + } + mutated = Call::make(op->type, op->name, args, op->call_type); + } else { + mutated = IRMutator::visit(op); + } + return mutated; + } + +public: + Stmt mutate(const Stmt &s) override { + ScopedValue scoped_lets(lets, {}); + just_in_let_definition = false; + Stmt mutated = IRMutator::mutate(s); + for (auto &let : reverse_view(lets)) { + // There is no recurse into let.second. This is handled by repeatedly calling this transform. + mutated = LetStmt::make(let.first, let.second, mutated); + } + return mutated; + } + + Expr mutate(const Expr &e) override { + bool exceeds_lanecount = e.type().lanes() > max_lanes; + + if (exceeds_lanecount) { + bool should_extract = false; + should_extract |= e.node_type() == IRNodeType::Shuffle; + should_extract |= e.node_type() == IRNodeType::VectorReduce; + + should_extract &= !just_in_let_definition; + + debug((should_extract ? 
3 : 4)) << "Max lanes (" << max_lanes << ") exceeded (" << e.type().lanes() << ") by: " << e << "\n"; + if (should_extract) { + std::string name = unique_name('t'); + Expr var = Variable::make(e.type(), name); + lets.emplace_back(name, e); + debug(3) << " => Lifted out into " << name << "\n"; + return var; + } + } + + just_in_let_definition = false; + return IRMutator::mutate(e); + } + + LiftExceedingVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LiftExceedingVectors should not be called when there is no lane limit."; + } +}; + +class LegalizeVectors : public IRMutator { + using IRMutator::visit; + + int max_lanes; + + Scope<> sliceable_vectors; + Scope> requested_slices; + + template + auto visit_let(const LetOrLetStmt *op) -> decltype(op->body) { + bool exceeds_lanecount = op->value.type().lanes() > max_lanes; + + if (exceeds_lanecount) { + int num_vecs = (op->value.type().lanes() + max_lanes - 1) / max_lanes; + debug(3) << "Legalize let " << op->value.type() << ": " << op->name + << " = " << op->value << " into " << num_vecs << " vecs\n"; + + // First mark this Let as sliceable before mutating the body: + ScopedBinding<> vector_is_slicable(sliceable_vectors, op->name); + + auto body = mutate(op->body); + // Here we know which requested vector variable slices should be created for the body of the Let/LetStmt to work. 
+ + if (const std::vector *reqs = requested_slices.find(op->name)) { + for (const VectorSlice &sl : *reqs) { + Expr value = extract_lanes(op->value, sl.start, sl.stride, sl.count, sliceable_vectors, requested_slices); + value = mutate(value); + body = LetOrLetStmt::make(sl.variable_name, value, body); + debug(3) << " Add: let " << sl.variable_name << " = " << value << "\n"; + } + requested_slices.pop(op->name); + } + return body; + } else { + return IRMutator::visit(op); + } + } + + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } + + Expr visit(const Let *op) override { + bool exceeds_lanecount = op->value.type().lanes() > max_lanes; + internal_assert(!exceeds_lanecount) << "All illegal Let's should have been converted to LetStmts"; + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + bool exceeds_lanecount = op->index.type().lanes() > max_lanes; + if (exceeds_lanecount) { + // Split up in multiple stores + int num_vecs = (op->index.type().lanes() + max_lanes - 1) / max_lanes; + + std::vector bundle_args; + bundle_args.reserve(num_vecs * 3); + + // Break up the index, predicate, and value of the Store into legal chunks. 
+ for (int i = 0; i < num_vecs; ++i) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); + + // Pack them in a known order: rhs, index, predicate + bundle_args.push_back(extract_lanes(op->value, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + bundle_args.push_back(extract_lanes(op->index, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + bundle_args.push_back(extract_lanes(op->predicate, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + } + + // Run CSE on the joint bundle + Expr joint_bundle = Call::make(Int(32), Call::bundle, bundle_args, Call::PureIntrinsic); + joint_bundle = common_subexpression_elimination(joint_bundle); + + // Peel off the `Let` expressions introduced by the CSE pass + std::vector> let_bindings; + while (const Let *let = joint_bundle.as()) { + let_bindings.emplace_back(let->name, let->value); + joint_bundle = let->body; + } + + // Destructure the bundle to get our optimized expressions + const Call *struct_call = joint_bundle.as(); + internal_assert(struct_call && struct_call->is_intrinsic(Call::bundle)) + << "Expected the CSE bundle to remain a bundle Call."; + + // Construct the legal stores with the CSE'd expressions + std::vector assignments; + assignments.reserve(num_vecs); + for (int i = 0; i < num_vecs; ++i) { + + // Unpack in the same order we packed them + Expr rhs = struct_call->args[i * 3 + 0]; + Expr index = struct_call->args[i * 3 + 1]; + Expr predicate = struct_call->args[i * 3 + 2]; + + ModulusRemainder alignment = op->alignment; + if (i != 0) { + // In case i == 0, we are taking the first lane, and the alignment is still valid. 
+ alignment = ModulusRemainder(); + } + + assignments.push_back(Store::make( + op->name, std::move(rhs), std::move(index), + op->param, std::move(predicate), alignment)); + } + + Stmt result = Block::make(assignments); + + // Wrap the block in LetStmts to properly scope all shared expressions + // Iterate backwards to build the LetStmt tree from the inside out. + for (auto &let : reverse_view(let_bindings)) { + result = LetStmt::make(let.first, let.second, result); + } + + debug(3) << "Legalized store " << Stmt(op) << " => " << result << "\n"; + return result; + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + // Primary violation: there are too many output lanes. + if (op->type.lanes() > max_lanes) { + // Break it down in multiple legal-output-length shuffles, and concatenate them back together. + int total_lanes = op->type.lanes(); + + std::vector output_chunks; + output_chunks.reserve((total_lanes + max_lanes - 1) / max_lanes); + for (int i = 0; i < total_lanes; i += max_lanes) { + int slice_len = std::min(max_lanes, total_lanes - i); + + std::vector slice_indices(slice_len); + for (int k = 0; k < slice_len; ++k) { + slice_indices[k] = op->indices[i + k]; + } + + Expr sub_shuffle = Shuffle::make(op->vectors, slice_indices); + + output_chunks.push_back(mutate(sub_shuffle)); + } + return Shuffle::make_concat(output_chunks); + } + + // Secondary violation: input vectors have too many lanes. + bool requires_mutation = false; + for (const auto &vec : op->vectors) { + if (vec.type().lanes() > max_lanes) { + requires_mutation = true; + break; + } + } + + if (requires_mutation) { + debug(4) << "Legalizing Shuffle " << Expr(op) << "\n"; + // We are dealing with a shuffle of an exceeding-lane-count vector argument. + // We can assume the variable here has extracted lane variables in surrounding Lets. + // So let's hope it's a simple case, and we can legalize. 
+ + vector new_vectors; + vector> vector_and_lane_indices = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->vectors.size()); ++i) { + const Expr &vec = op->vectors[i]; + if (vec.type().lanes() > max_lanes) { + debug(4) << " Arg " << i << ": " << vec << "\n"; + int num_vecs = (vec.type().lanes() + max_lanes - 1) / max_lanes; + for (int i = 0; i < num_vecs; i++) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(vec.type().lanes() - lane_start, max_lanes); + new_vectors.push_back(extract_lanes(vec, lane_start, 1, lane_count_for_vec, sliceable_vectors, requested_slices)); + } + } else { + new_vectors.push_back(IRMutator::mutate(vec)); + } + } + Expr result = simplify(Shuffle::make(new_vectors, op->indices)); + debug(3) << "Legalized " << Expr(op) << " => " << result << "\n"; + return result; + } + + // Base case: everything legal in this Shuffle + return IRMutator::visit(op); + } + + Expr visit(const VectorReduce *op) override { + // Written with the help of Gemini 3 Pro. + Expr value = mutate(op->value); + + int input_lanes = value.type().lanes(); + int output_lanes = op->type.lanes(); + + // Base case: we don't need legalization. + if (input_lanes <= max_lanes && output_lanes <= max_lanes) { + if (value.same_as(op->value)) { + return op; + } else { + return VectorReduce::make(op->op, value, output_lanes); + } + } + + // Recursive splitting strategy. + // Case A: Segmented Reduction (Multiple Output Lanes) + // Example: VectorReduce( <16 lanes>, output_lanes=2 ) with max_lanes=4. + // Input is too big. We split the OUTPUT domain. + // We calculate which chunk of the input corresponds to the first half of the output. + if (output_lanes > 1) { + // 1. Calculate good splitting point + int out_split = output_lanes / 2; + + // 2. 
However, do align to max_lanes to keep chunks native-sized if possible + if (out_split > max_lanes) { + out_split = (out_split / max_lanes) * max_lanes; + } else if (output_lanes > max_lanes) { + // If the total is > max, but half is < max (e.g. 6), + // we want to peel 'max' (4) rather than split (3). + out_split = max_lanes; + } + + // Take remainder beyond the split point + int out_remaining = output_lanes - out_split; + internal_assert(out_remaining >= 1); + + // Calculate the reduction factor to find where to split the input + // e.g., 16 input -> 2 output means factor is 8. + // If we want the first 1 output lane, we need the first 8 input lanes. + int reduction_factor = input_lanes / output_lanes; + int in_split = out_split * reduction_factor; + int in_remaining = input_lanes - in_split; + + Expr arg_lo = extract_lanes(value, 0, 1, in_split, sliceable_vectors, requested_slices); + Expr arg_hi = extract_lanes(value, in_split, 1, in_remaining, sliceable_vectors, requested_slices); + + // Recursively mutate the smaller reductions + Expr res_lo = mutate(VectorReduce::make(op->op, arg_lo, out_split)); + Expr res_hi = mutate(VectorReduce::make(op->op, arg_hi, out_remaining)); + + // Concatenate the results to form the new vector + return Shuffle::make_concat({res_lo, res_hi}); + } + + // Case B: Horizontal Reduction (Single Output Lane) + // Example: VectorReduce( <16 lanes>, output_lanes=1 ) with max_lanes=4. + // We cannot split the output. We must split the INPUT, reduce both halves + // to scalars, and then combine them. 
+ if (output_lanes == 1) { + int in_split = input_lanes / 2; + int in_remaining = input_lanes - in_split; + + // Extract input halves + Expr arg_lo = extract_lanes(value, 0, 1, in_split, sliceable_vectors, requested_slices); + Expr arg_hi = extract_lanes(value, in_split, 1, in_remaining, sliceable_vectors, requested_slices); + + // Recursively reduce both halves to scalars + Expr res_lo = mutate(VectorReduce::make(op->op, arg_lo, 1)); + Expr res_hi = mutate(VectorReduce::make(op->op, arg_hi, 1)); + + // Combine using the standard binary operator for this reduction type + switch (op->op) { + case VectorReduce::Add: + return res_lo + res_hi; + case VectorReduce::SaturatingAdd: + return saturating_add(res_lo, res_hi); + case VectorReduce::Mul: + return res_lo * res_hi; + case VectorReduce::Min: + return min(res_lo, res_hi); + case VectorReduce::Max: + return max(res_lo, res_hi); + case VectorReduce::And: + return res_lo && res_hi; + case VectorReduce::Or: + return res_lo || res_hi; + default: + internal_error << "Unknown VectorReduce operator\n"; + return Expr(); + } + } + + internal_error << "Unreachable"; + return op; + } + +public: + LegalizeVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LegalizeVectors should not be called when there is no lane limit."; + } +}; + +} // namespace + +Stmt legalize_vectors_in_device_loop(const For *op) { + int max_lanes = max_lanes_for_device(op->device_api, 0); + + // Similar to CSE, lifting out stuff into variables. + // Pass 1): lift out Shuffles that exceed lane count into variables + // Pass 2): Rewrite those vector variables as bundles of vector variables, while legalizing all other stuff. 
+ Stmt m0 = simplify(op->body); + Stmt m1 = common_subexpression_elimination(m0, false); + if (!m1.same_as(op->body)) { + debug(3) << "After CSE:\n" + << m1 << "\n"; + } + Stmt m2 = LiftLetToLetStmt().mutate(m1); + if (!m2.same_as(m1)) { + debug(3) << "After lifting Lets to LetStmts:\n" + << m2 << "\n"; + } + + Stmt m3 = m2; + while (true) { + Stmt m = LiftExceedingVectors(max_lanes).mutate(m3); + bool modified = !m3.same_as(m); + m3 = std::move(m); + if (!modified) { + debug(3) << "Nothing got lifted out\n"; + break; + } else { + debug(3) << "After lifting exceeding vectors:\n" + << m3 << "\n"; + } + } + + Stmt m4 = LegalizeVectors(max_lanes).mutate(m3); + if (!m4.same_as(m3)) { + debug(3) << "After legalizing vectors:\n" + << m4 << "\n"; + } + if (m4.same_as(m2)) { + debug(3) << "Vector Legalization did nothing, returning input.\n"; + return op; + } + Stmt m5 = simplify(m4); + if (!m4.same_as(m5)) { + debug(3) << "After simplify:\n" + << m5 << "\n"; + } + return For::make(op->name, op->min, op->max, op->for_type, + op->partition_policy, op->device_api, m5); +} + +Stmt legalize_vectors(const Stmt &s) { + return mutate_with(s, [&](auto *self, const For *op) { + if (max_lanes_for_device(op->device_api, 0)) { + return legalize_vectors_in_device_loop(op); + } + return self->visit_base(op); + }); +} +} // namespace Internal +} // namespace Halide diff --git a/src/LegalizeVectors.h b/src/LegalizeVectors.h new file mode 100644 index 000000000000..14fe8d806fb1 --- /dev/null +++ b/src/LegalizeVectors.h @@ -0,0 +1,19 @@ +#ifndef HALIDE_INTERNAL_LEGALIZE_VECTORS_H +#define HALIDE_INTERNAL_LEGALIZE_VECTORS_H + +#include "Expr.h" + +/** \file + * Defines a lowering pass that legalizes vectorized expressions + * to have a maximal lane count. 
+ */ + +namespace Halide { +namespace Internal { + +Stmt legalize_vectors(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Lower.cpp b/src/Lower.cpp index 9b55bd20840d..0af72b265cf0 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -42,6 +42,7 @@ #include "InjectHostDevBufferCopies.h" #include "Inline.h" #include "LICM.h" +#include "LegalizeVectors.h" #include "LoopCarry.h" #include "LowerParallelTasks.h" #include "LowerWarpShuffles.h" @@ -439,6 +440,10 @@ void lower_impl(const vector &output_funcs, s = flatten_nested_ramps(s); log("Lowering after flattening nested ramps:", s); + debug(1) << "Legalizing vectors...\n"; + s = legalize_vectors(s); + log("Lowering after legalizing vectors:", s); + debug(1) << "Removing dead allocations and moving loop invariant code...\n"; s = remove_dead_allocations(s); s = simplify(s); diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index 0eb3bbaf3c15..3d1d7b8f053b 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -69,7 +69,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { return value; } - if (info && op->type.is_int()) { + if (info && op->type.is_int_or_uint()) { switch (op->op) { case VectorReduce::Add: // Alignment of result is the alignment of the arg. 
Bounds @@ -123,7 +123,8 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { case VectorReduce::Add: { auto rewrite = IRMatcher::rewriter(IRMatcher::h_add(value, lanes), op->type); if (rewrite(h_add(x * broadcast(y, arg_lanes), lanes), h_add(x, lanes) * broadcast(y, lanes)) || - rewrite(h_add(broadcast(x, arg_lanes) * y, lanes), h_add(y, lanes) * broadcast(x, lanes))) { + rewrite(h_add(broadcast(x, arg_lanes) * y, lanes), h_add(y, lanes) * broadcast(x, lanes)) || + rewrite(h_add(broadcast(x, arg_lanes), lanes), broadcast(x * factor, lanes))) { return mutate(rewrite.result, info); } break; @@ -136,7 +137,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_min(max(broadcast(x, arg_lanes), y), lanes), max(h_min(y, lanes), broadcast(x, lanes))) || rewrite(h_min(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_min(broadcast(x, c0), lanes), h_min(x, lanes), factor % c0 == 0) || - rewrite(h_min(ramp(x, y, arg_lanes), lanes), x + min(y * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_min(ramp(x, y, arg_lanes), lanes), x + min(y * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -150,7 +151,7 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_max(max(broadcast(x, arg_lanes), y), lanes), max(h_max(y, lanes), broadcast(x, lanes))) || rewrite(h_max(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_max(broadcast(x, c0), lanes), h_max(x, lanes), factor % c0 == 0) || - rewrite(h_max(ramp(x, y, arg_lanes), lanes), x + max(y * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_max(ramp(x, y, arg_lanes), lanes), x + max(y * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -164,14 +165,14 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_and(broadcast(x, arg_lanes) && y, lanes), h_and(y, lanes) && broadcast(x, lanes)) || rewrite(h_and(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || 
rewrite(h_and(broadcast(x, c0), lanes), h_and(x, lanes), factor % c0 == 0) || - rewrite(h_and(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), - x + max(y * (arg_lanes - 1), 0) < z) || - rewrite(h_and(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), - x + max(y * (arg_lanes - 1), 0) <= z) || - rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x < y + min(z * (arg_lanes - 1), 0)) || - rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x <= y + min(z * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_and(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), + x + max(y * (arg_lanes - 1), 0) < z)) || + (lanes == 1 && rewrite(h_and(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), + x + max(y * (arg_lanes - 1), 0) <= z)) || + (lanes == 1 && rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x < y + min(z * (arg_lanes - 1), 0))) || + (lanes == 1 && rewrite(h_and(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x <= y + min(z * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } @@ -186,14 +187,14 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *info) { rewrite(h_or(broadcast(x, arg_lanes), lanes), broadcast(x, lanes)) || rewrite(h_or(broadcast(x, c0), lanes), h_or(x, lanes), factor % c0 == 0) || // type of arg_lanes is somewhat indeterminate - rewrite(h_or(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), - x + min(y * (arg_lanes - 1), 0) < z) || - rewrite(h_or(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), - x + min(y * (arg_lanes - 1), 0) <= z) || - rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x < y + max(z * (arg_lanes - 1), 0)) || - rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), - x <= y + max(z * (arg_lanes - 1), 0)) || + (lanes == 1 && rewrite(h_or(ramp(x, y, arg_lanes) < broadcast(z, arg_lanes), lanes), + x + min(y * (arg_lanes - 1), 0) < z)) || + (lanes == 1 && 
rewrite(h_or(ramp(x, y, arg_lanes) <= broadcast(z, arg_lanes), lanes), + x + min(y * (arg_lanes - 1), 0) <= z)) || + (lanes == 1 && rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x < y + max(z * (arg_lanes - 1), 0))) || + (lanes == 1 && rewrite(h_or(broadcast(x, arg_lanes) < ramp(y, z, arg_lanes), lanes), + x <= y + max(z * (arg_lanes - 1), 0))) || false) { return mutate(rewrite.result, info); } diff --git a/src/Simplify_Let.cpp b/src/Simplify_Let.cpp index 1c60e7a2510d..9f18a6fb25c1 100644 --- a/src/Simplify_Let.cpp +++ b/src/Simplify_Let.cpp @@ -98,7 +98,7 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { Expr new_var = Variable::make(f.new_value.type(), f.new_name); Expr replacement = new_var; - debug(4) << "simplify let " << op->name << " = " << f.value << " in...\n"; + debug(4) << "simplify let " << op->name << " = (" << f.value.type() << ") " << f.value << " in...\n"; while (true) { const Variable *var = f.new_value.template as(); @@ -180,6 +180,21 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { f.new_value = cast->value; new_var = Variable::make(f.new_value.type(), f.new_name); replacement = substitute(f.new_name, Cast::make(cast->type, new_var), replacement); + } else if (shuffle && shuffle->is_concat() && is_pure(shuffle)) { + // Substitute in all concatenates as they will likely simplify + // with other shuffles. + // As the structure of this while loop makes it hard to peel off + // pure operations from _all_ arguments to the Shuffle, we will + // instead substitute all of the vars that go in the shuffle, and + // instead guard against side effects by checking with `is_pure()`. + // + // Also, it is safe to substitute in without combinatorial + // blow-up, because deeply nested concats implies a + // combinatorially-large number of vector lanes, which we can't + // express in the type system anyway. 
+ replacement = substitute(f.new_name, shuffle, replacement); + f.new_value = Expr(); + break; } else if (shuffle && shuffle->is_slice()) { // Replacing new_value below might free the shuffle // indices vector, so save them now. diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..644418664ffc 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -5,6 +5,7 @@ namespace Halide { namespace Internal { +using std::pair; using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { @@ -25,9 +26,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } - // Mutate the vectors vector new_vectors; + vector new_indices = op->indices; bool changed = false; + + // Mutate the vectors for (const Expr &vector : op->vectors) { ExprInfo v_info; Expr new_vector = mutate(vector, &v_info); @@ -45,57 +48,164 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { new_vectors.push_back(new_vector); } - // Try to convert a load with shuffled indices into a - // shuffle of a dense load. + // A concat of one vector, is just the vector. + // (Early check, this is repeated below, once the argument list is potentially reduced) + if (op->vectors.size() == 1 && op->is_concat()) { + return new_vectors[0]; + } + + Expr result = op; + + // Analyze which input vectors are actually used. We will rewrite + // the vector of inputs and the indices jointly, and continue with + // those below. + { + vector arg_used(new_vectors.size()); + // Figure out if all extracted lanes come from 1 component. + vector> src_vec_and_lane_idx = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->indices.size()); ++i) { + arg_used[src_vec_and_lane_idx[i].first] = true; + } + size_t num_args_used = 0; + for (bool used : arg_used) { + if (used) { + num_args_used++; + } + } + + if (num_args_used < op->vectors.size()) { + // Not all arguments to the shuffle are used by the indices. + // Let's throw them out. 
+ for (int vi = arg_used.size() - 1; vi >= 0; --vi) { + if (!arg_used[vi]) { + int lanes_deleted = op->vectors[vi].type().lanes(); + int vector_start_lane = 0; + for (int i = 0; i < vi; ++i) { + vector_start_lane += op->vectors[i].type().lanes(); + } + for (int &new_index : new_indices) { + if (new_index > vector_start_lane) { + internal_assert(new_index >= vector_start_lane + lanes_deleted); + new_index -= lanes_deleted; + } + } + new_vectors.erase(new_vectors.begin() + vi); + } + } + + changed = true; + } + } + + // Replace the op with the intermediate simplified result (if it changed), and continue. + if (changed) { + result = Shuffle::make(new_vectors, new_indices); + op = result.as(); + changed = false; + } + + if (new_vectors.size() == 1) { + const Ramp *ramp = new_vectors[0].as(); + if (ramp && ramp->base.type().is_scalar() && op->is_slice()) { + int first_lane_in_src = op->indices[0]; + int slice_stride = op->slice_stride(); + if (slice_stride >= 1) { + return mutate(Ramp::make(ramp->base + first_lane_in_src * ramp->stride, + ramp->stride * slice_stride, + op->indices.size()), + nullptr); + } + } + + // Test this again, but now after new_vectors got potentially shorter. + if (op->is_concat()) { + return new_vectors[0]; + } + } + + // Try to convert a Shuffle of Loads into a single Load of a Ramp. + // Make sure to not undo the work of the StageStridedLoads pass: + // only if the result of the shuffled indices is a *dense* ramp, we + // can proceed. There are two side cases: concatenations of scalars, + // and when the loads weren't dense to begin with. 
if (const Load *first_load = new_vectors[0].as()) { vector load_predicates; vector load_indices; + bool all_loads_are_dense = true; bool unpredicated = true; + bool concat_of_scalars = true; for (const Expr &e : new_vectors) { const Load *load = e.as(); if (load && load->name == first_load->name) { load_predicates.push_back(load->predicate); load_indices.push_back(load->index); unpredicated = unpredicated && is_const_one(load->predicate); + if (const Ramp *index_ramp = load->index.as()) { + if (!is_const_one(index_ramp->stride)) { + all_loads_are_dense = false; + } + } else if (!load->index.type().is_scalar()) { + all_loads_are_dense = false; + } + if (!load->index.type().is_scalar()) { + concat_of_scalars = false; + } } else { break; } } + debug(3) << "Shuffle of Load found: " << result << " where" + << " all_loads_are_dense=" << all_loads_are_dense << "," + << " concat_of_scalars=" << concat_of_scalars << "\n"; + if (load_indices.size() == new_vectors.size()) { + // All of the Shuffle arguments are Loads. 
Type t = load_indices[0].type().with_lanes(op->indices.size()); Expr shuffled_index = Shuffle::make(load_indices, op->indices); + debug(3) << " Shuffled index: " << shuffled_index << "\n"; ExprInfo shuffled_index_info; shuffled_index = mutate(shuffled_index, &shuffled_index_info); - if (shuffled_index.as()) { - ExprInfo base_info; - if (const Ramp *r = shuffled_index.as()) { - mutate(r->base, &base_info); - } + debug(3) << " Simplified shuffled index: " << shuffled_index << "\n"; + if (const Ramp *index_ramp = shuffled_index.as()) { + if (is_const_one(index_ramp->stride) || !all_loads_are_dense || concat_of_scalars) { + ExprInfo base_info; + mutate(index_ramp->base, &base_info); - ModulusRemainder alignment = - ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); + ModulusRemainder alignment = + ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); - Expr shuffled_predicate; - if (unpredicated) { - shuffled_predicate = const_true(t.lanes(), nullptr); - } else { - shuffled_predicate = Shuffle::make(load_predicates, op->indices); - shuffled_predicate = mutate(shuffled_predicate, nullptr); + Expr shuffled_predicate; + if (unpredicated) { + shuffled_predicate = const_true(t.lanes(), nullptr); + } else { + shuffled_predicate = Shuffle::make(load_predicates, op->indices); + shuffled_predicate = mutate(shuffled_predicate, nullptr); + } + t = first_load->type; + t = t.with_lanes(op->indices.size()); + Expr result = Load::make(t, first_load->name, shuffled_index, first_load->image, + first_load->param, shuffled_predicate, alignment); + debug(3) << " => " << result << "\n"; + return result; } - t = first_load->type; - t = t.with_lanes(op->indices.size()); - return Load::make(t, first_load->name, shuffled_index, first_load->image, - first_load->param, shuffled_predicate, alignment); + } else { + // We can't... Leave it as a Shuffle of Loads. 
+ // Note: no mutate-recursion as we are dealing here with a + // Shuffle of Loads, which have already undergone mutation + // early in this function (new_vectors). + return result; } } } // Try to collapse a shuffle of broadcasts into a single // broadcast. Note that it doesn't matter what the indices - // are. + // are. Only applies when the broadcast value is scalar, + // because Broadcast::make(vec, N) has vec.lanes() * N total + // lanes. const Broadcast *b1 = new_vectors[0].as(); - if (b1) { + if (b1 && b1->value.type().is_scalar()) { bool can_collapse = true; for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) { if (const Broadcast *b2 = new_vectors[i].as()) { @@ -289,13 +399,18 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { if (inner_shuffle->is_concat()) { int slice_min = op->indices.front(); int slice_max = op->indices.back(); + if (slice_min > slice_max) { + // Slices can go backward. + std::swap(slice_min, slice_max); + } int concat_index = 0; int new_slice_start = -1; vector new_concat_vectors; for (const auto &v : inner_shuffle->vectors) { // Check if current concat vector overlaps with slice. 
- if ((concat_index >= slice_min && concat_index <= slice_max) || - ((concat_index + v.type().lanes() - 1) >= slice_min && (concat_index + v.type().lanes() - 1) <= slice_max)) { + int overlap_max = std::min(slice_max, concat_index + v.type().lanes() - 1); + int overlap_min = std::max(slice_min, concat_index); + if (overlap_min <= overlap_max) { if (new_slice_start < 0) { new_slice_start = concat_index; } @@ -305,17 +420,16 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { concat_index += v.type().lanes(); } if (new_concat_vectors.size() < inner_shuffle->vectors.size()) { - return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), op->slice_begin() - new_slice_start, op->slice_stride(), op->indices.size()); + return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), + op->slice_begin() - new_slice_start, + op->slice_stride(), + op->indices.size()); } } } } - if (!changed) { - return op; - } else { - return Shuffle::make(new_vectors, op->indices); - } + return result; } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 2d149adbaf20..fc6fd9531983 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -732,8 +732,8 @@ class VectorSubs : public IRMutator { if (op->is_intrinsic(Call::prefetch)) { // We don't want prefetch args to ve vectorized, but we can't just skip the mutation - // (otherwise we can end up with dead loop variables. Instead, use extract_lane() on each arg - // to scalarize it again. + // (otherwise we can end up with dead loop variables). Instead, use extract_lane() on + // each arg to scalarize it again. 
for (auto &arg : new_args) { if (arg.type().is_vector()) { arg = extract_lane(arg, 0); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index e9d345b6d403..db74739e20da 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -280,6 +280,8 @@ const char *vk_get_error_name(VkResult error) { return "VK_ERROR_FORMAT_NOT_SUPPORTED"; case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: @@ -303,6 +305,8 @@ const char *vk_get_error_name(VkResult error) { } } +#define vk_report_error(user_context, code, func) (error((user_context)) << "Vulkan: " << (func) << " returned " << vk_get_error_name((code)) << " (code: " << (code) << ") ") + // -------------------------------------------------------------------------- } // namespace diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index d2ef2ee5ba6f..35c38d05dd7f 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -85,7 +85,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, debug(user_context) << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "queue_index: " << queue_index << ")\n"; + << "queue_index: " << queue_index << ")"; #endif if (allocator == nullptr) { @@ -103,7 +103,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkResult result = vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create command pool!\n"; + vk_report_error(user_context, result, "vkCreateCommandPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -117,7 +117,7 @@ int 
vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkResetCommandPool(allocator->current_device(), command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); @@ -135,7 +135,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocato << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -150,7 +150,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocator *allocato VkResult result = vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to allocate command buffers!\n"; + vk_report_error(user_context, result, "vkAllocateCommandBuffers"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -165,7 +165,7 @@ int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocat << "command_buffer: " << (void *)command_buffer << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command buffer ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -231,7 +231,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkBeginCommandBuffer"); return halide_error_code_generic_error; } @@ -242,7 +242,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkEndCommandBuffer"); return halide_error_code_generic_error; } @@ -272,7 +272,7 @@ int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer VkResult result = vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkSubmitQueue"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -325,7 +325,7 @@ int vk_create_descriptor_pool(void *user_context, << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -362,7 +362,7 @@ int vk_create_descriptor_pool(void *user_context, VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create descriptor pool! 
vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -378,7 +378,7 @@ int vk_destroy_descriptor_pool(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); @@ -402,7 +402,7 @@ int vk_create_descriptor_set_layout(void *user_context, << "layout: " << (void *)layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -460,7 +460,7 @@ int vk_create_descriptor_set_layout(void *user_context, // Create the descriptor set layout VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); if (result != VK_SUCCESS) { - error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorSetLayout"); return halide_error_code_generic_error; } @@ -478,7 +478,7 @@ int vk_destroy_descriptor_set_layout(void *user_context, << "layout: " << (void *)descriptor_set_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); @@ -500,7 +500,7 @@ int vk_create_descriptor_set(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -515,7 +515,7 @@ int vk_create_descriptor_set(void *user_context, VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkAllocateDescriptorSets"); return halide_error_code_generic_error; } @@ -541,7 +541,7 @@ int vk_update_descriptor_set(void *user_context, << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -599,7 +599,7 @@ int vk_update_descriptor_set(void *user_context, // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast(owner->handle); if (device_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -698,7 +698,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!"; return nullptr; } @@ -711,7 +711,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, // allocate a new region MemoryRegion *region = allocator->reserve(user_context, request); if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!"; return nullptr; } @@ -733,19 +733,19 @@ int vk_update_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... 
invalid memory region!"; return halide_error_code_internal_error; } // map the region to a host ptr uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); if (host_ptr == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!"; return halide_error_code_internal_error; } @@ -798,7 +798,7 @@ int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator * << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -832,7 +832,7 @@ int vk_create_pipeline_layout(void *user_context, << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create pipeline layout ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -841,7 +841,7 @@ int vk_create_pipeline_layout(void *user_context, if (descriptor_set_count > max_bound_descriptor_sets) { error(user_context) << "Vulkan: Number of descriptor sets for pipeline layout exceeds the number that can be bound by device!\n" << " requested: " << descriptor_set_count << "," - << " available: " << max_bound_descriptor_sets << "\n"; + << " available: " << max_bound_descriptor_sets; return halide_error_code_incompatible_device_interface; } } @@ -858,7 +858,7 @@ int vk_create_pipeline_layout(void *user_context, VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreatePipelineLayout"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -876,7 +876,7 @@ int vk_destroy_pipeline_layout(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -898,11 +898,12 @@ int vk_create_compute_pipeline(void *user_context, debug(user_context) << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " + << "pipeline_name: " << pipeline_name << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -928,7 +929,10 @@ int vk_create_compute_pipeline(void *user_context, VkResult result = vkCreateComputePipelines(allocator->current_device(), VK_NULL_HANDLE, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateComputePipeline") + << "failed to create compute pipeline " << pipeline_name << ".\n" + << " (This might be a bug in Halide. To debug this, see the HL_SPIRV_DUMP_FILE environment variable, and use the Khronos validator to make a bug report)"; + return halide_error_code_generic_error; } @@ -955,24 +959,24 @@ int vk_setup_compute_pipeline(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!"; return halide_error_code_generic_error; } VkResult result = VK_SUCCESS; const char *entry_point_name = shader_bindings->entry_point_name; if (entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... 
missing entry point name!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!"; return halide_error_code_generic_error; } @@ -995,7 +999,7 @@ int vk_setup_compute_pipeline(void *user_context, } else { // dynamic allocation if (shared_mem_constant_id > 0) { - error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is supported!!\n"; + error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is suported!!"; result = VK_ERROR_TOO_MANY_OBJECTS; break; } @@ -1028,13 +1032,13 @@ int vk_setup_compute_pipeline(void *user_context, if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } if (dispatch_data->shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of dynamic shared memory used exceeds device limit!\n" << " requested: " << dispatch_data->shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1065,14 +1069,14 @@ int vk_setup_compute_pipeline(void *user_context, } } if (found_index == invalid_index) { - error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!\n"; + error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!"; result = VK_ERROR_INITIALIZATION_FAILED; } } // don't even attempt to create the pipeline layout if we encountered errors in the shader binding if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to decode shader bindings! 
" << vk_get_error_name(result) << "\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings! " << vk_get_error_name(result); return halide_error_code_generic_error; } @@ -1100,7 +1104,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline) { int error_code = vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline!"; return halide_error_code_generic_error; } shader_bindings->compute_pipeline = VK_NULL_HANDLE; @@ -1108,7 +1112,7 @@ int vk_setup_compute_pipeline(void *user_context, int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } @@ -1118,7 +1122,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline == VK_NULL_HANDLE) { int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } } @@ -1138,7 +1142,7 @@ int vk_destroy_compute_pipeline(void *user_context, << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline ... 
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -1160,12 +1164,12 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!"; return nullptr; } if ((module_ptr == nullptr) || (module_size < (2 * sizeof(uint32_t)))) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!"; return nullptr; } @@ -1213,7 +1217,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA uint32_t idx = 1; // skip past the header_word_count uint32_t shader_count = module_ptr[idx++]; if (shader_count < 1) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!"; return nullptr; // no descriptors } @@ -1222,7 +1226,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shader_bindings_size = shader_count * sizeof(VulkanShaderBinding); VulkanShaderBinding *shader_bindings = (VulkanShaderBinding *)vk_host_malloc(user_context, shader_bindings_size, 0, alloc_scope, allocator->callbacks()); if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shader_bindings! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shader_bindings! 
Out of memory!"; return nullptr; } memset(shader_bindings, 0, shader_bindings_size); @@ -1255,7 +1259,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t specialization_constants_size = specialization_constants_count * sizeof(VulkanSpecializationConstant); specialization_constants = (VulkanSpecializationConstant *)vk_host_malloc(user_context, specialization_constants_size, 0, alloc_scope, allocator->callbacks()); if (specialization_constants == nullptr) { - error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!"; return nullptr; } memset(specialization_constants, 0, specialization_constants_size); @@ -1291,7 +1295,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shared_memory_allocations_size = shared_memory_allocations_count * sizeof(VulkanSharedMemoryAllocation); shared_memory_allocations = (VulkanSharedMemoryAllocation *)vk_host_malloc(user_context, shared_memory_allocations_size, 0, alloc_scope, allocator->callbacks()); if (shared_memory_allocations == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!"; return nullptr; } memset(shared_memory_allocations, 0, shared_memory_allocations_size); @@ -1356,7 +1360,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif shader_bindings[n].entry_point_name = (char *)vk_host_malloc(user_context, entry_point_name_length * sizeof(uint32_t), 0, alloc_scope, allocator->callbacks()); if (shader_bindings[n].entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to allocate entry_point_name! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate entry_point_name! 
Out of memory!"; return nullptr; } @@ -1408,7 +1412,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1420,7 +1424,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (shader_count > max_descriptors) { error(user_context) << "Vulkan: Number of required descriptor sets exceeds the amount available for device!\n" << " requested: " << shader_count << "," - << " available: " << max_descriptors << "\n"; + << " available: " << max_descriptors; return halide_error_code_incompatible_device_interface; } } @@ -1516,7 +1520,7 @@ VulkanCompilationCacheEntry *vk_compile_kernel_module(void *user_context, Vulkan // Compile the "SPIR-V Module" for the kernel cache_entry->compiled_modules[i] = vk_compile_shader_module(user_context, allocator, (const char *)spirv_ptr, (int)spirv_size); if (cache_entry->compiled_modules[i] == nullptr) { - debug(user_context) << "Vulkan: Failed to compile shader module!\n"; + debug(user_context) << "Vulkan: Failed to compile shader module!"; error_code = halide_error_code_generic_error; } @@ -1556,12 +1560,12 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!"; return nullptr; } if ((ptr == nullptr) || (size <= 0)) { - error(user_context) << "Vulkan: Failed to compile shader modules ... 
invalid program source buffer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!"; return nullptr; } @@ -1599,7 +1603,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; VulkanCompiledShaderModule *compiled_module = (VulkanCompiledShaderModule *)vk_host_malloc(user_context, sizeof(VulkanCompiledShaderModule), 0, alloc_scope, allocator->callbacks()); if (compiled_module == nullptr) { - error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!"; return nullptr; } memset(compiled_module, 0, sizeof(VulkanCompiledShaderModule)); @@ -1607,7 +1611,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM // decode the entry point data and extract the shader bindings VulkanShaderBinding *decoded_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); if (decoded_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings!"; return nullptr; } @@ -1624,8 +1628,8 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM compiled_module->shader_count = shader_count; VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &compiled_module->shader_module); - if ((result != VK_SUCCESS)) { - error(user_context) << "Vulkan: vkCreateShaderModule Failed! 
Error returned: " << vk_get_error_name(result) << "\n"; + if (result != VK_SUCCESS) { + vk_report_error(user_context, result, "vkCreateShaderModule"); vk_host_free(user_context, compiled_module->shader_bindings, allocator->callbacks()); vk_host_free(user_context, compiled_module, allocator->callbacks()); return nullptr; @@ -1635,7 +1639,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM if (compiled_module->shader_count) { compiled_module->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, compiled_module->shader_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (compiled_module->descriptor_set_layouts == nullptr) { - error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!"; return nullptr; } memset(compiled_module->descriptor_set_layouts, 0, compiled_module->shader_count * sizeof(VkDescriptorSetLayout)); @@ -1808,7 +1812,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff VkBuffer *src_buffer = reinterpret_cast(c.src); VkBuffer *dst_buffer = reinterpret_cast(c.dst); if (!src_buffer || !dst_buffer) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -1846,7 +1850,7 @@ int vk_device_crop_from_offset(void *user_context, VulkanContext ctx(user_context); if (ctx.error != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to acquire context!\n"; + error(user_context) << "Vulkan: Failed to acquire context!"; return ctx.error; } @@ -1855,14 +1859,14 @@ int vk_device_crop_from_offset(void *user_context, #endif if (byte_offset < 0) { - error(user_context) << "Vulkan: Invalid offset for device crop!\n"; + 
error(user_context) << "Vulkan: Invalid offset for device crop!"; return halide_error_code_device_crop_failed; } // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast(src->device); if (device_region == nullptr) { - error(user_context) << "Vulkan: Failed to crop region! Invalid device region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Invalid device region!"; return halide_error_code_device_crop_failed; } @@ -1873,7 +1877,7 @@ int vk_device_crop_from_offset(void *user_context, region_indexing.offset = byte_offset / src->type.bytes(); MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, region_indexing); if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!"; return halide_error_code_device_crop_failed; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 64f2c9ac6825..9edfd0476cbf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -4,7 +4,7 @@ include(CheckCXXCompilerFlag) # Internal tests are a special case. # HalideTestHelpers depends on this test being present. 
add_executable(_test_internal internal.cpp) -target_link_libraries(_test_internal PRIVATE Halide::Test) +target_link_libraries(_test_internal PRIVATE Halide::Test Halide::TerminateHandler) target_include_directories(_test_internal PRIVATE "${Halide_SOURCE_DIR}/src") target_precompile_headers(_test_internal PRIVATE ) if (Halide_CCACHE_BUILD) diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 6d41a9e71219..d732f1e72284 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -124,6 +124,7 @@ tests(GROUPS correctness fuse.cpp fuse_gpu_threads.cpp fused_where_inner_extent_is_zero.cpp + fuzz_extract_lanes.cpp fuzz_float_stores.cpp fuzz_schedule.cpp fuzz_simplify.cpp @@ -221,6 +222,7 @@ tests(GROUPS correctness math.cpp median3x3.cpp memoize_cloned.cpp + metal_long_vectors.cpp metal_precompiled_shaders.cpp min_extent.cpp mod.cpp diff --git a/test/correctness/fuzz_extract_lanes.cpp b/test/correctness/fuzz_extract_lanes.cpp new file mode 100644 index 000000000000..e23e60ccf0be --- /dev/null +++ b/test/correctness/fuzz_extract_lanes.cpp @@ -0,0 +1,492 @@ +#include "Halide.h" +#include +#include +#include + +// Fuzz test for deinterleave / extract_lane operations in Deinterleave.cpp. +// Constructs random vector expressions covering the IR node types that +// the Deinterleaver has visit methods for, evaluates them by JIT-compiling +// with a custom lowering pass, then checks that deinterleave() produces +// results consistent with the original expression. 
+ +namespace { + +using std::string; +using std::vector; +using namespace Halide; +using namespace Halide::Internal; + +using RandomEngine = std::mt19937_64; + +constexpr int fuzz_var_count = 3; +std::vector> fuzz_vars(fuzz_var_count); + +template +decltype(auto) random_choice(RandomEngine &rng, T &&choices) { + std::uniform_int_distribution dist(0, std::size(choices) - 1); + return choices[dist(rng)]; +} + +Type fuzz_types[] = {UInt(8), UInt(16), UInt(32), UInt(64), Int(8), Int(16), Int(32), Int(64)}; + +Type random_scalar_type(RandomEngine &rng) { + return random_choice(rng, fuzz_types); +} + +int random_factor(RandomEngine &rng, int x) { + vector factors; + factors.reserve(x); + for (int i = 1; i < x; i++) { + if (x % i == 0) { + factors.push_back(i); + } + } + return random_choice(rng, factors); +} + +Expr random_const(RandomEngine &rng, Type t) { + int val = (int)((int8_t)(rng() & 0x0f)); + if (t.is_vector()) { + return Broadcast::make(cast(t.element_of(), val), t.lanes()); + } else { + return cast(t, val); + } +} + +Expr random_leaf(RandomEngine &rng, Type t) { + if (t.is_scalar()) { + if (rng() & 1) { + // Variable + std::uniform_int_distribution dist(0, fuzz_var_count - 1); + return cast(t, fuzz_vars[dist(rng)]); + } else { + return random_const(rng, t); + } + } + // For vector types, build from Ramp or Broadcast + int lanes = t.lanes(); + if (rng() & 1) { + Expr base = random_leaf(rng, t.element_of()); + Expr stride = random_const(rng, t.element_of()); + return Ramp::make(base, stride, lanes); + } else { + Expr val = random_leaf(rng, t.element_of()); + return Broadcast::make(val, lanes); + } +} + +Expr random_vector_expr(RandomEngine &rng, Type t, int depth) { + if (depth <= 0 || t.lanes() == 1) { + return random_leaf(rng, t); + } + + // Weight the choices to cover all Deinterleaver visit methods: + // Broadcast, Ramp, Cast, Reinterpret, Call (via abs), Shuffle, + // VectorReduce, Add/Sub/Min/Max (handled by default IRMutator) + std::function ops[] = { + 
// Leaf + [&]() -> Expr { + return random_leaf(rng, t); + }, + // Add + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a + b; + }, + // Sub (only for signed types to avoid unsigned underflow coercion errors) + [&]() -> Expr { + if (t.is_uint()) { + // Fall back to Add for unsigned types + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a + b; + } + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return a - b; + }, + // Min + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return min(a, b); + }, + // Max + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + internal_assert(a.type() == b.type()) << a << " " << b; + return max(a, b); + }, + // Select + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + Expr c = random_vector_expr(rng, t, depth - 1); + Expr cond = (a > b); + return select(cond, a, c); + }, + // Cast + [&]() -> Expr { + // Cast from a different type + Type other = random_scalar_type(rng).with_lanes(t.lanes()); + while (other == t) { + other = random_scalar_type(rng).with_lanes(t.lanes()); + } + Expr e = random_vector_expr(rng, other, depth - 1); + return Cast::make(t, e); + }, + // Reinterpret (different bit width, changes lane count) + [&]() -> Expr { + int total_bits = t.bits() * t.lanes(); + // Pick a different bit width that divides the total bits evenly + int bit_widths[] = {8, 16, 32, 64}; + vector valid_widths; + for (int bw : bit_widths) { + if (total_bits % bw == 0) { + valid_widths.push_back(bw); + } + } + // Should at least be able to preserve the existing bit width and change signedness. 
+ internal_assert(!valid_widths.empty()); + int other_bits = random_choice(rng, valid_widths); + int other_lanes = total_bits / other_bits; + Type other = ((rng() & 1) ? Int(other_bits) : UInt(other_bits)).with_lanes(other_lanes); + Expr e = random_vector_expr(rng, other, depth - 1); + return Reinterpret::make(t, e); + }, + // Broadcast of sub-expression + [&]() -> Expr { + int f = random_factor(rng, t.lanes()); + Expr val = random_vector_expr(rng, t.with_lanes(f), depth - 1); + return Broadcast::make(val, t.lanes() / f); + }, + // Ramp + [&]() -> Expr { + int f = random_factor(rng, t.lanes()); + Type sub_t = t.with_lanes(f); + Expr base = random_vector_expr(rng, sub_t, depth - 1); + Expr stride = random_const(rng, sub_t); + return Ramp::make(base, stride, t.lanes() / f); + }, + // Shuffle (interleave) + [&]() -> Expr { + if (t.lanes() >= 4 && t.lanes() % 2 == 0) { + int half = t.lanes() / 2; + Expr a = random_vector_expr(rng, t.with_lanes(half), depth - 1); + Expr b = random_vector_expr(rng, t.with_lanes(half), depth - 1); + return Shuffle::make_interleave({a, b}); + } + // Fall back to a simple expression + return random_vector_expr(rng, t, depth - 1); + }, + // Shuffle (concat) + [&]() -> Expr { + if (t.lanes() >= 4 && t.lanes() % 2 == 0) { + int half = t.lanes() / 2; + Expr a = random_vector_expr(rng, t.with_lanes(half), depth - 1); + Expr b = random_vector_expr(rng, t.with_lanes(half), depth - 1); + return Shuffle::make_concat({a, b}); + } + return random_vector_expr(rng, t, depth - 1); + }, + // Shuffle (slice) + [&]() -> Expr { + // Make a wider vector and slice it + if (t.lanes() <= 8) { + int wider = t.lanes() * 2; + Expr e = random_vector_expr(rng, t.with_lanes(wider), depth - 1); + // Slice: take every other element starting at 0 or 1 + int start = rng() & 1; + return Shuffle::make_slice(e, start, 2, t.lanes()); + } + return random_vector_expr(rng, t, depth - 1); + }, + // VectorReduce (only when we can make it work with lane counts) + [&]() -> Expr { + 
// Input has more lanes, output has t.lanes() lanes + // factor must divide input lanes, and input lanes = t.lanes() * factor + int factor = (rng() % 3) + 2; + int input_lanes = t.lanes() * factor; + if (input_lanes <= 32) { + VectorReduce::Operator ops[] = { + VectorReduce::Add, + VectorReduce::Min, + VectorReduce::Max, + }; + auto op = random_choice(rng, ops); + Expr val = random_vector_expr(rng, t.with_lanes(input_lanes), depth - 1); + internal_assert(val.type().lanes() == input_lanes) << val; + return VectorReduce::make(op, val, t.lanes()); + } + return random_vector_expr(rng, t, depth - 1); + }, + // Call node (using a pure intrinsic like absd) + [&]() -> Expr { + Expr a = random_vector_expr(rng, t, depth - 1); + Expr b = random_vector_expr(rng, t, depth - 1); + return cast(t, absd(a, b)); + }, + }; + + Expr e = random_choice(rng, ops)(); + internal_assert(e.type() == t) << e.type() << " " << t << " " << e; + return e; +} + +// A custom lowering pass that replaces a specific dummy store RHS with the +// desired test expression. This lets us JIT-evaluate arbitrary vector Exprs. +class InjectExpr : public IRMutator { + using IRMutator::visit; + + string func_name; + const std::vector &replacements; + int idx = 0; + + Stmt visit(const Store *op) override { + // Replace calls to our dummy function with the replacement expr + internal_assert(idx < (int)replacements.size()); + if (op->name == func_name) { + return Store::make(op->name, flatten_nested_ramps(replacements[idx++]), + op->index, op->param, op->predicate, op->alignment); + } + return IRMutator::visit(op); + } + +public: + InjectExpr(const string &func_name, const std::vector &replacements) + : func_name(func_name), replacements(replacements) { + } +}; + +// Evaluate a vector expression by JIT-compiling it. Returns the values +// as a vector of int64_t (to hold any integer type). +// The expression may reference variables a, b, c which are set to fixed values. 
+bool evaluate_vector_exprs(const std::vector &e, + Buffer &result) { + Type t = e[0].type(); + int lanes = t.lanes(); + + // Create a Func that outputs a vector of the right size + Func f("test_func"); + Var x("x"), y("y"); + + // We define f(x, y) as a dummy, then inject our expressions via a custom + // lowering pass + Expr fuzz_var_sum = 0; + for (int i = 0; i < fuzz_var_count; i++) { + fuzz_var_sum += fuzz_vars[i]; + } + f(x, y) = cast(t.element_of(), fuzz_var_sum); + f.bound(x, 0, lanes) + .bound(y, 0, (int)e.size()) + .vectorize(x) + .unroll(y); + + // The custom lowering pass replaces the dummy RHS + InjectExpr injector(f.name(), e); + + auto buf = Runtime::Buffer<>(t.element_of(), {lanes, (int)e.size()}); + + Pipeline p(f); + p.add_custom_lowering_pass(&injector, nullptr); + if (get_target_from_environment() == get_host_target()) { + p.realize(buf); + } else { + // Compile something, to be able to at least test CodeGen from the backends and LLVM. + p.compile_to_assembly("fuzz_extract_lanes.s", {fuzz_vars[0], fuzz_vars[1], fuzz_vars[2]}, "fuzz_func"); + return false; + } + + // Upcast results to int64 for easier comparison + internal_assert(result.height() == (int)e.size()); + internal_assert(result.width() == lanes); + for (int y = 0; y < (int)e.size(); y++) { + for (int x = 0; x < lanes; x++) { + if (t.is_uint()) { + switch (t.bits()) { + case 8: + result(x, y) = buf.as()(x, y); + break; + case 16: + result(x, y) = buf.as()(x, y); + break; + case 32: + result(x, y) = buf.as()(x, y); + break; + case 64: + result(x, y) = buf.as()(x, y); + break; + default: + return false; + } + } else { + switch (t.bits()) { + case 8: + result(x, y) = buf.as()(x, y); + break; + case 16: + result(x, y) = buf.as()(x, y); + break; + case 32: + result(x, y) = buf.as()(x, y); + break; + case 64: + result(x, y) = buf.as()(x, y); + break; + default: + return false; + } + } + } + } + + return true; +} + +template +T initialize_rng() { + constexpr size_t kStateWords = T::state_size 
* sizeof(typename T::result_type) / sizeof(uint32_t); + vector random(kStateWords); + std::generate(random.begin(), random.end(), std::random_device{}); + std::seed_seq seed_seq(random.begin(), random.end()); + return T{seed_seq}; +} + +bool test_one(RandomEngine &rng) { + // Pick a random vector width and type + int lanes = std::uniform_int_distribution(4, 16)(rng); + Type scalar_t = random_scalar_type(rng); + Type t = scalar_t.with_lanes(lanes); + + // Pick random deinterleave parameters + int starting_lane = std::uniform_int_distribution(0, lanes - 1)(rng); + int ending_lane = std::uniform_int_distribution(0, lanes - 1)(rng); + int new_lanes = std::abs(ending_lane - starting_lane) + 1; + int lane_stride = std::uniform_int_distribution(1, new_lanes)(rng); + // bias it towards small strides + lane_stride = std::uniform_int_distribution(1, lane_stride)(rng); + new_lanes /= lane_stride; + if (starting_lane > ending_lane) { + lane_stride = -lane_stride; + } + + // Generate a batch of random vector expressions + constexpr int batch_size = 64; + constexpr int depth = 4; + std::vector original(batch_size); + std::vector sliced(batch_size); + + for (int i = 0; i < batch_size; i++) { + original[i] = random_vector_expr(rng, t, depth); + sliced[i] = extract_lanes(original[i], starting_lane, lane_stride, new_lanes); + internal_assert(sliced[i].type() == scalar_t.with_lanes(new_lanes)) + << sliced[i].type() << " vs " << scalar_t.with_lanes(new_lanes); + } + + // Pick random variable values + for (int i = 0; i < fuzz_var_count; i++) { + fuzz_vars[i].set((int)((int8_t)(rng() & 0x0f))); + } + + // Evaluate both + Buffer orig_vals(lanes, batch_size), sliced_vals(new_lanes, batch_size); + if (!evaluate_vector_exprs(original, orig_vals) || + !evaluate_vector_exprs(sliced, sliced_vals)) { + // Can't evaluate this for whatever reason + return true; + } + + // Check that the sliced values match the corresponding lanes of the original + for (int y = 0; y < batch_size; y++) { + for (int 
x = 0; x < new_lanes; x++) { + int orig_lane = starting_lane + x * lane_stride; + if (sliced_vals(x, y) != orig_vals(orig_lane, y)) { + std::cerr << "MISMATCH! (y=" << y << ", x=" << x << ")\n" + << "Original expr: " << original[y] << "\n" + << "Original type: " << original[y].type() << "\n" + << "ExtractLanes params: starting_lane=" << starting_lane + << " lane_stride=" << lane_stride + << " new_lanes=" << new_lanes << "\n" + << "Sliced expr: " << sliced[y] << "\n" + << "Variables:"; + for (int j = 0; j < fuzz_var_count; j++) { + std::cerr << " " << fuzz_vars[j].name() << "=" << fuzz_vars[j].get() << "\n"; + } + std::cerr << "\n" + << "Original values:"; + for (int j = 0; j < lanes; j++) { + std::cerr << " " << orig_vals(j, y); + } + std::cerr << "\n" + << "Sliced values:"; + for (int j = 0; j < new_lanes; j++) { + std::cerr << " " << sliced_vals(j, y); + } + std::cerr << "\n"; + return false; + } + } + + std::cerr << "Original values:"; + for (int j = 0; j < lanes; j++) { + std::cerr << " " << orig_vals(j, y); + } + std::cerr << " Sliced values:"; + for (int j = 0; j < new_lanes; j++) { + std::cerr << " " << sliced_vals(j, y); + } + std::cerr << " Correct.\n"; + } + + return true; +} + +} // namespace + +int main(int argc, char **argv) { + Target t = get_jit_target_from_environment(); + if (t.has_feature(Target::SVE2)) { + printf("[SKIP-WITH-ISSUE-9026] LLVM generates incorrect IR for some expressions.\n"); + return 0; + } + if (t.arch != Target::X86 || t.bits != 64) { + printf("[SKIP-WITH-ISSUE-9040] Only running test on X86-64 for now. See also #9044."); + return 0; + } + auto seed_generator = initialize_rng(); + + /* Seeds known to have failed in the past: */ + std::vector seeds_to_try = { + 11290674455725750672ull, + 18322803614019275106ull, + 12847901530538798383ull, + + // Failures on ARM: + 5792148528566212763, + 6300344786331520063, + }; + + size_t num_iters = (argc > 1) ? 
1 : 64; + + for (size_t i = 0; i < num_iters; i++) { + uint64_t seed = seed_generator(); + if (i < seeds_to_try.size()) { + seed = seeds_to_try[i]; + } + if (argc > 1) { + std::istringstream{argv[1]} >> seed; + } + std::cout << "Seed: " << seed << "\n"; + RandomEngine rng{seed}; + + if (!test_one(rng)) { + std::cout << "Failed with seed " << seed << "\n"; + return 1; + } + } + + std::cout << "Success!\n"; + return 0; +} diff --git a/test/error/metal_vector_too_large.cpp b/test/correctness/metal_long_vectors.cpp similarity index 89% rename from test/error/metal_vector_too_large.cpp rename to test/correctness/metal_long_vectors.cpp index bf4c74bb75a0..74c2e981fc2d 100644 --- a/test/error/metal_vector_too_large.cpp +++ b/test/correctness/metal_long_vectors.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); f(x, y) = input(x, y) + 42; - f.vectorize(x, 16).gpu_blocks(y, DeviceAPI::Metal); + f.vectorize(x, 32).gpu_blocks(y, DeviceAPI::Metal); std::string test_object = Internal::get_test_tmp_dir() + "metal_vector_too_large.o"; Target mac_target("x86-64-osx-metal"); diff --git a/test/correctness/require.cpp b/test/correctness/require.cpp index 625383f460df..58226077d971 100644 --- a/test/correctness/require.cpp +++ b/test/correctness/require.cpp @@ -9,7 +9,7 @@ void halide_error(JITUserContext *ctx, const char *msg) { // Emitting "error.*:" to stdout or stderr will cause CMake to report the // test as a failure on Windows, regardless of error code returned, // hence the abbreviation to "err". 
- printf("Saw (Expected) Halide Err: %s\n", msg); + printf("Saw (Expected) Halide Err: %s", msg); error_occurred = true; } @@ -46,14 +46,18 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(1); p2.set(kPrime1 - 1); error_occurred = false; result = f.realize({realize_width}); if (error_occurred) { - printf("There should not have been a requirement error (vector_width = %d)\n", vector_width); + printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); } for (int i = 0; i < realize_width; ++i) { @@ -64,6 +68,8 @@ static void test(int vector_width) { exit(1); } } + printf("OK\n"); + printf("\n"); ImageParam input(Int(32), 2); Expr h = require(p1 == p2, p1); @@ -81,8 +87,12 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(16); p2.set(16); @@ -91,6 +101,8 @@ static void test(int vector_width) { if (error_occurred) { printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 5a61df34252e..8139751d56e6 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -507,20 +507,27 @@ class SimdOpCheckTest { })); } + std::vector failed_tests; + constexpr int tabstop = 32; for (auto &f : futures) { auto result = f.get(); - constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - // The thread-pool destructor will block until 
in-progress tasks - // are done, and then will discard any tasks that haven't been - // launched yet. - return false; + failed_tests.push_back(std::move(result)); } } + if (!failed_tests.empty()) { + std::cerr << "SIMD op check summary: " << failed_tests.size() << " tests failed:\n"; + for (auto &result : failed_tests) { + const int spaces = std::max(1, tabstop - (int)result.op.size()); + std::cerr << " " << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; + } + return false; + } + return true; } diff --git a/test/correctness/simplify.cpp b/test/correctness/simplify.cpp index 628de4d91504..de10bde5a1b9 100644 --- a/test/correctness/simplify.cpp +++ b/test/correctness/simplify.cpp @@ -810,6 +810,24 @@ void check_vectors() { int_vector); check(VectorReduce::make(VectorReduce::Max, Broadcast::make(int_vector, 4), 8), VectorReduce::make(VectorReduce::Max, Broadcast::make(int_vector, 4), 8)); + + { + // h_add(broadcast(x, 8), 4) should simplify to broadcast(x * 2, 4) + check(VectorReduce::make(VectorReduce::Add, broadcast(x, 8), 4), + broadcast(x * 2, 4)); + } + + { + Expr const_u8 = cast(UInt(8), 3); + check(VectorReduce::make(VectorReduce::Add, broadcast(const_u8, 9), 3), broadcast(cast(UInt(8), 9), 3)); + } + + { + // Test VectorReduce::Add on a variable of unsigned type to ensure the multiplied factor + // keeps the correct type and avoids type-mismatch assertion failures. 
+ Expr u8_x = Variable::make(UInt(8), "u8_x"); + check(VectorReduce::make(VectorReduce::Add, broadcast(u8_x, 9), 3), broadcast(u8_x * cast(UInt(8), 3), 3)); + } } void check_bounds() { diff --git a/test/correctness/specialize.cpp b/test/correctness/specialize.cpp index 1a807003f72a..8df87dd27333 100644 --- a/test/correctness/specialize.cpp +++ b/test/correctness/specialize.cpp @@ -128,6 +128,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); @@ -156,6 +161,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); @@ -243,6 +253,10 @@ int main(int argc, char **argv) { // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize({5}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -254,6 +268,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; @@ -282,6 +300,10 @@ int main(int argc, char **argv) { // Check we used scalar stores for a strided input. 
reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -293,6 +315,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index 8a82f5ca33d1..a847304cbdd1 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -22,7 +22,7 @@ class CheckForStridedLoads : public IRMutator { if (const Ramp *r = op->index.as()) { if (op->name == buf_name) { bool dense = is_const_one(r->stride); - found |= !dense; + found_strided_load |= !dense; dense_loads += dense; } } @@ -30,27 +30,27 @@ class CheckForStridedLoads : public IRMutator { } public: - bool found = false; + bool found_strided_load = false; int dense_loads = 0; std::string buf_name; void check(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(!found); + assert(!found_strided_load); assert(dense_loads == desired_dense_loads); } void check_not(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(found); + assert(found_strided_load); assert(dense_loads == desired_dense_loads); } } checker; diff --git a/test/correctness/vector_shuffle.cpp b/test/correctness/vector_shuffle.cpp index aff6fcbcddcf..f0a62ab3d8cd 100644 --- a/test/correctness/vector_shuffle.cpp +++ 
b/test/correctness/vector_shuffle.cpp @@ -1,10 +1,20 @@ #include "Halide.h" +#include +#include #include using namespace Halide; -int main(int argc, char **argv) { - Target target = get_jit_target_from_environment(); +int test_with_indices(const Target &target, const std::vector &indices0, const std::vector &indices1) { + printf("indices0:"); + for (int i : indices0) { + printf(" %d", i); + } + printf(" indices1:"); + for (int i : indices1) { + printf(" %d", i); + } + printf("\n"); Var x{"x"}, y{"y"}; Func f0{"f0"}, f1{"f1"}, g{"g"}; @@ -12,15 +22,6 @@ int main(int argc, char **argv) { f1(x, y) = x * (y + 3); Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); - std::vector indices0; - std::vector indices1; - if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) { - indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; - indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; - } else { - indices0 = {3, 1, 6, 7}; - indices1 = {1, 0, 3, 4}; - } Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); Expr result = shuffle1 * shuffle2; @@ -55,6 +56,94 @@ int main(int argc, char **argv) { return 1; } } + return 0; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + + for (int vec_size = 8; vec_size > 1; vec_size /= 2) { + printf("Testing vector size %d...\n", vec_size); + std::vector indices0, indices1; + + // Test 1: All indices: forward/backward and combined + for (int i = 0; i < vec_size; ++i) { + indices0.push_back(i); // forward + indices1.push_back(vec_size - i - 1); // backward + } + printf(" All indices forward...\n"); + if (test_with_indices(target, indices0, indices0)) { + return 1; + } + printf(" All indices backward...\n"); + if (test_with_indices(target, indices1, indices1)) { + return 1; + 
} + printf(" All indices mixed forward / backward...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 2: Shuffled indices (4 repetitions) + for (int r = 0; r < 4; ++r) { + // Shuffle with Fisher-Yates + for (int i = vec_size - 1; i >= 1; --i) { + // indices0 + int idx = std::rand() % (i + 1); + std::swap(indices0[idx], indices0[i]); + // indices1 + idx = std::rand() % (i + 1); + std::swap(indices1[idx], indices1[i]); + } + printf(" Randomly shuffled...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + + // Test 3: Interleaved + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size / 2; ++i) { + // interleave (A, B) + indices0.push_back(i); + indices0.push_back(i + vec_size / 2); + + // interleave (B, A) + indices1.push_back(i + vec_size / 2); + indices1.push_back(i); + } + printf(" Interleaved...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 4: Concat (not-really, as the input-vectors are size 4, so only if vec_size == 8, it's a concat) + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size; ++i) { + // concat (A, B) + indices0.push_back(i); + + // concat (B, A) + indices1.push_back((i + vec_size / 2) % vec_size); + } + printf(" Concat...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + if (vec_size == 4) { + indices0 = {1, 3, 2, 0}; + indices1 = {2, 3, 1, 0}; + + printf(" Specific index combination, known to have caused problems...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + } + printf("Success!\n"); return 0; } diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index fc9496af0244..df6e74a2b5f5 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -82,7 +82,6 @@ tests(GROUPS error memoize_output_invalid.cpp memoize_redefine_eviction_key.cpp metal_threads_too_large.cpp - metal_vector_too_large.cpp 
mismatch_runtime_vscale.cpp missing_args.cpp no_default_device.cpp diff --git a/test/performance/nested_vectorization_gemm.cpp b/test/performance/nested_vectorization_gemm.cpp index 660d3d7bbdf8..4d831e4ba247 100644 --- a/test/performance/nested_vectorization_gemm.cpp +++ b/test/performance/nested_vectorization_gemm.cpp @@ -300,7 +300,6 @@ int main(int argc, char **argv) { return 1; } } - printf("Success!\n"); // 8-bit sparse blur into 32-bit accumulator { @@ -396,5 +395,6 @@ int main(int argc, char **argv) { } } + printf("Success!\n"); return 0; }