diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index e43097b95180..8054307cf271 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1027,96 +1027,13 @@ void CodeGen_ARM::visit(const Load *op) { return; } + // If the stride is in [-1, 4], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - - // If the stride is one or minus one, we can deal with that using vanilla codegen - if (stride && (stride->value == 1 || stride->value == -1)) { + if (stride && (-1 <= stride->value && stride->value <= 4)) { CodeGen_Posix::visit(op); return; } - // Strided loads with known stride - if (stride && stride->value >= 2 && stride->value <= 4) { - // Check alignment on the base. Attempt to shift to an earlier - // address if it simplifies the expression. This makes - // adjacent strided loads shared a vldN op. - Expr base = ramp->base; - int offset = 0; - ModulusRemainder mod_rem = modulus_remainder(ramp->base); - - const Add *add = base.as(); - const IntImm *add_b = add ? add->b.as() : nullptr; - - if ((mod_rem.modulus % stride->value) == 0) { - offset = mod_rem.remainder % stride->value; - } else if ((mod_rem.modulus == 1) && add_b) { - offset = add_b->value % stride->value; - if (offset < 0) { - offset += stride->value; - } - } - - if (offset) { - base = simplify(base - offset); - mod_rem.remainder -= offset; - if (mod_rem.modulus) { - mod_rem.remainder = mod_imp(mod_rem.remainder, mod_rem.modulus); - } - } - - int alignment = op->type.bytes(); - alignment *= gcd(mod_rem.modulus, mod_rem.remainder); - // Maximum stack alignment on arm is 16 bytes, so we should - // never claim alignment greater than that. - alignment = gcd(alignment, 16); - internal_assert(alignment > 0); - - // Decide what width to slice things into. If not a multiple - // of 64 or 128 bits, then we can't safely slice it up into - // some number of vlds, so we hand it over the base class. - int bit_width = op->type.bits() * op->type.lanes(); - int intrin_lanes = 0; - if (bit_width % 128 == 0) { - intrin_lanes = 128 / op->type.bits(); - } else if (bit_width % 64 == 0) { - intrin_lanes = 64 / op->type.bits(); - } else { - CodeGen_Posix::visit(op); - return; - } - - llvm::Type *load_return_type = llvm_type_of(op->type.with_lanes(intrin_lanes * stride->value)); - llvm::Type *load_return_pointer_type = load_return_type->getPointerTo(); - Value *undef = UndefValue::get(load_return_type); - SmallVector constants; - for (int j = 0; j < intrin_lanes; j++) { - Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset); - constants.push_back(constant); - } - Constant *constantsV = ConstantVector::get(constants); - - vector results; - for (int i = 0; i < op->type.lanes(); i += intrin_lanes) { - Expr slice_base = simplify(base + i * ramp->stride); - Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_lanes); - Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base); - Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type); - LoadInst *loadI = cast(builder->CreateLoad(bitcastI)); -#if LLVM_VERSION >= 110 - loadI->setAlignment(Align(alignment)); -#else - loadI->setAlignment(MaybeAlign(alignment)); -#endif - add_tbaa_metadata(loadI, op->name, slice_ramp); - Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV); - results.push_back(shuffleInstr); - } - - // Concat the results - value = concat_vectors(results); - return; - } - // We have builtins for strided loads with fixed but unknown stride, but they use inline assembly. if (target.bits != 64 /* Not yet implemented for aarch64 */) { ostringstream builtin; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index eda89aca82da..f8fdf854e6b2 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1910,63 +1910,66 @@ void CodeGen_LLVM::visit(const Load *op) { if (ramp && stride && stride->value == 1) { value = codegen_dense_vector_load(op); - } else if (ramp && stride && stride->value == 2) { - // Load two vectors worth and then shuffle - Expr base_a = ramp->base, base_b = ramp->base + ramp->lanes; - Expr stride_a = make_one(base_a.type()); - Expr stride_b = make_one(base_b.type()); - - ModulusRemainder align_a = op->alignment; - ModulusRemainder align_b = align_a + ramp->lanes; - - // False indicates we should take the even-numbered lanes - // from the load, true indicates we should take the - // odd-numbered-lanes. - bool shifted_a = false, shifted_b = false; + } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { + // Try to rewrite strided loads as shuffles of dense loads, + // aligned to the stride. This makes adjacent strided loads + // share the same underlying dense loads. + ModulusRemainder align = op->alignment; + Expr base = ramp->base; + int aligned_stride = gcd(stride->value, align.modulus); + int offset = 0; + if (aligned_stride == stride->value) { + offset = mod_imp((int)align.remainder, aligned_stride); + } else { + const Add *add = base.as(); + if (const IntImm *add_c = add ? add->b.as() : base.as()) { + offset = mod_imp(add_c->value, stride->value); + } + } - bool external = op->param.defined() || op->image.defined(); + if (offset) { + base = simplify(base - offset); + align.remainder -= offset; + } - // Don't read beyond the end of an external buffer. + // We want to load a few more bytes than the original load did. + // We know this is safe for internal buffers because we allocate + // padding. // (In ASAN mode, don't read beyond the end of internal buffers either, // as ASAN will complain even about harmless stack overreads.) + // The min moves lower by offset. + int load_lanes = ramp->lanes * stride->value; + bool external = op->param.defined() || op->image.defined(); if (external || target.has_feature(Target::ASAN)) { - base_b -= 1; - align_b = align_b - 1; - shifted_b = true; - } else { - // If the base ends in an odd constant, then subtract one - // and do a different shuffle. This helps expressions like - // (f(2*x) + f(2*x+1) share loads - const Add *add = ramp->base.as(); - const IntImm *offset = add ? add->b.as() : ramp->base.as(); - if (offset && offset->value & 1) { - base_a -= 1; - align_a = align_a - 1; - shifted_a = true; - base_b -= 1; - align_b = align_b - 1; - shifted_b = true; - } + load_lanes -= (stride->value - 1 - offset); } - // Do each load. - Expr ramp_a = Ramp::make(base_a, stride_a, ramp->lanes); - Expr ramp_b = Ramp::make(base_b, stride_b, ramp->lanes); - Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param, op->predicate, align_a); - Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param, op->predicate, align_b); - Value *vec_a = codegen(load_a); - Value *vec_b = codegen(load_b); + int slice_lanes = native_vector_bits() / op->type.bits(); - // Shuffle together the results. - vector indices(ramp->lanes); - for (int i = 0; i < (ramp->lanes + 1) / 2; i++) { - indices[i] = i * 2 + (shifted_a ? 1 : 0); - } - for (int i = (ramp->lanes + 1) / 2; i < ramp->lanes; i++) { - indices[i] = i * 2 + (shifted_b ? 1 : 0); + // We need to slice the result in to native vector lanes, otherwise + // LLVM misses optimizations like using ldN on ARM. + vector results; + for (int i = 0; i < op->type.lanes(); i += slice_lanes) { + int load_lanes_i = std::min(slice_lanes * stride->value, load_lanes - i); + int lanes_i = std::min(slice_lanes, op->type.lanes() - i); + Expr slice_base = simplify(base + i * ramp->stride); + + Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, + op->image, op->param, op->alignment, nullptr, false); + + SmallVector constants; + for (int j = 0; j < lanes_i; j++) { + Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset); + constants.push_back(constant); + } + Constant *constantsV = ConstantVector::get(constants); + Value *undef = UndefValue::get(load_i->getType()); + Value *shuffleInstr = builder->CreateShuffleVector(load_i, undef, constantsV); + results.push_back(shuffleInstr); } - value = shuffle_vectors(vec_a, vec_b, indices); + // Concat the results + value = concat_vectors(results); } else if (ramp && stride && stride->value == -1) { // Load the vector and then flip it in-place Expr flipped_base = ramp->base - ramp->lanes + 1; @@ -2249,14 +2252,14 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) { } } -Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) { - debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n"; - - const Ramp *ramp = load->index.as(); - internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n"; +llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred, bool slice_to_native) { + debug(4) << "Vectorize predicated dense vector load:\n\t" + << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; - bool is_external = (external_buffer.find(load->name) != external_buffer.end()); - int alignment = load->type.bytes(); // The size of a single element + bool is_external = (external_buffer.find(name) != external_buffer.end()); + int align_bytes = type.bytes(); // The size of a single element int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; @@ -2266,60 +2269,68 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) { // maximum alignment we can infer based on the index alone. // Boost the alignment if possible, up to the native vector width. - ModulusRemainder mod_rem = load->alignment; + ModulusRemainder mod_rem = alignment; while ((mod_rem.remainder & 1) == 0 && (mod_rem.modulus & 1) == 0 && - alignment < native_bytes) { + align_bytes < native_bytes) { mod_rem.modulus /= 2; mod_rem.remainder /= 2; - alignment *= 2; + align_bytes *= 2; } // If it is an external buffer, then we cannot assume that the host pointer // is aligned to at least native vector width. However, we may be able to do // better than just assuming that it is unaligned. if (is_external) { - if (load->param.defined()) { - int host_alignment = load->param.host_alignment(); - alignment = gcd(alignment, host_alignment); - } else if (get_target().has_feature(Target::JIT) && load->image.defined()) { + if (param.defined()) { + int host_alignment = param.host_alignment(); + align_bytes = gcd(align_bytes, host_alignment); + } else if (get_target().has_feature(Target::JIT) && image.defined()) { // If we're JITting, use the actual pointer value to determine alignment for embedded buffers. - alignment = gcd(alignment, (int)(((uintptr_t)load->image.data()) & std::numeric_limits::max())); + align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits::max())); } } // For dense vector loads wider than the native vector // width, bust them up into native vectors - int load_lanes = load->type.lanes(); - int native_lanes = std::max(1, native_bits / load->type.bits()); + int load_lanes = type.lanes(); + int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes; vector slices; for (int i = 0; i < load_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, load_lanes - i); - Expr slice_base = simplify(ramp->base + i); + Expr slice_base = simplify(base + i); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); - llvm::Type *slice_type = get_vector_type(llvm_type_of(load->type.element_of()), slice_lanes); - Value *elt_ptr = codegen_buffer_pointer(load->name, load->type.element_of(), slice_base); + llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes); + Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); Instruction *load_inst; if (vpred != nullptr) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); #if LLVM_VERSION >= 110 - load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(alignment), slice_mask); + load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(align_bytes), slice_mask); #else - load_inst = builder->CreateMaskedLoad(vec_ptr, alignment, slice_mask); + load_inst = builder->CreateMaskedLoad(vec_ptr, align_bytes, slice_mask); #endif } else { - load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(alignment)); + load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(align_bytes)); } - add_tbaa_metadata(load_inst, load->name, slice_index); + add_tbaa_metadata(load_inst, name, slice_index); slices.push_back(load_inst); } value = concat_vectors(slices); return value; } +Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) { + const Ramp *ramp = load->index.as(); + internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n"; + + return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param, + load->alignment, vpred, slice_to_native); +} + void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 092bc7713b5b..06ec634938ec 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -552,7 +552,10 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false); - llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr); + llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred = nullptr, bool slice_to_native = true); + llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_vector_load(const Load *op); virtual void codegen_predicated_vector_store(const Store *op); diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index 32b3202477aa..49a627b767b1 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -75,10 +75,10 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type } int CodeGen_Posix::allocation_padding(Type type) const { - // We potentially load one scalar value past the end of the + // We potentially load 3 scalar values past the end of the // buffer, so pad the allocation with an extra instance of the // scalar type. - return type.bytes(); + return 3 * type.bytes(); } CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type, diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp index e3b11503aff2..2f435f975dd3 100644 --- a/test/correctness/nested_tail_strategies.cpp +++ b/test/correctness/nested_tail_strategies.cpp @@ -45,14 +45,15 @@ void check(Func out, int line, std::vector tails) { largest_allocation = 0; out.realize({s}); size_t expected = (s + 1) * 4; - if (largest_allocation > expected) { + size_t tolerance = 3 * sizeof(int); + if (largest_allocation > expected + tolerance) { std::cerr << "Failure on line " << line << "\n" << "with tail strategies: "; for (auto t : tails) { std::cerr << t << " "; } std::cerr << "\n allocation of " << largest_allocation - << " bytes is too large. Expected " << expected << "\n"; + << " bytes is too large. Expected " << expected + tolerance << "\n"; abort(); } } diff --git a/test/correctness/pseudostack_shares_slots.cpp b/test/correctness/pseudostack_shares_slots.cpp index fd491ba149ea..a9e2f49dd048 100644 --- a/test/correctness/pseudostack_shares_slots.cpp +++ b/test/correctness/pseudostack_shares_slots.cpp @@ -2,10 +2,11 @@ using namespace Halide; -std::vector mallocs; +const int tolerance = 3 * sizeof(int); +std::vector mallocs; void *my_malloc(void *user_context, size_t x) { - mallocs.push_back(x); + mallocs.push_back((int)x); void *orig = malloc(x + 32); void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5); ((void **)ptr)[-1] = orig; @@ -52,9 +53,11 @@ int main(int argc, char **argv) { mallocs.clear(); p.set(sz); chain.back().realize({1024}); - size_t sz1 = sz + 2 * 20 - 1; - size_t sz2 = sz1 - 2; - if (mallocs.size() != 2 || mallocs[0] != sz1 || mallocs[1] != sz2) { + int sz1 = sz + 2 * 20 - 1; + int sz2 = sz1 - 2; + if (mallocs.size() != 2 || + std::abs(mallocs[0] - sz1) > tolerance || + std::abs(mallocs[1] - sz2) > tolerance) { printf("Incorrect allocations: %d %d %d\n", (int)mallocs.size(), (int)mallocs[0], (int)mallocs[1]); printf("Expected: 2 %d %d\n", (int)sz1, (int)sz2); return -1; @@ -93,14 +96,18 @@ int main(int argc, char **argv) { mallocs.clear(); p.set(sz); chain.back().realize({1024}); - size_t sz1 = sz / 8 + 23; - size_t sz2 = sz1 - 2; - size_t sz3 = sz + 19; - size_t sz4 = sz3 - 2; - if (mallocs.size() != 4 || mallocs[0] != sz1 || mallocs[1] != sz2 || mallocs[2] != sz3 || mallocs[3] != sz4) { + int sz1 = sz / 8 + 23; + int sz2 = sz1 - 2; + int sz3 = sz + 19; + int sz4 = sz3 - 2; + if (mallocs.size() != 4 || + std::abs(mallocs[0] - sz1) > tolerance || + std::abs(mallocs[1] - sz2) > tolerance || + std::abs(mallocs[2] - sz3) > tolerance || + std::abs(mallocs[3] - sz4) > tolerance) { printf("Incorrect allocations: %d %d %d %d %d\n", (int)mallocs.size(), - (int)mallocs[0], (int)mallocs[1], (int)mallocs[2], (int)mallocs[3]); - printf("Expected: 4 %d %d %d %d\n", (int)sz1, (int)sz2, (int)sz3, (int)sz4); + mallocs[0], mallocs[1], mallocs[2], mallocs[3]); + printf("Expected: 4 %d %d %d %d\n", sz1, sz2, sz3, sz4); return -1; } } diff --git a/test/correctness/reorder_storage.cpp b/test/correctness/reorder_storage.cpp index 546812bad57a..953cd2b1e821 100644 --- a/test/correctness/reorder_storage.cpp +++ b/test/correctness/reorder_storage.cpp @@ -3,11 +3,13 @@ using namespace Halide; -size_t expected_allocation = 0; +// Backends allocate up to 3 extra elements. +int tolerance = 3 * sizeof(int); +int expected_allocation = 0; void *my_malloc(void *user_context, size_t x) { - if (x != expected_allocation) { - printf("Error! Expected allocation of %zu bytes, got %zu bytes\n", expected_allocation, x); + if (std::abs((int)x - expected_allocation) > tolerance) { + printf("Error! Expected allocation of %d bytes, got %zu bytes (tolerance %d)\n", expected_allocation, x, tolerance); exit(-1); } return malloc(x); @@ -40,11 +42,10 @@ int main(int argc, char **argv) { g.set_custom_allocator(my_malloc, my_free); // Without any storage alignment, we should expect an allocation - // that is the product of the extents of the realization (plus one - // for the magical extra Halide element). + // that is the product of the extents of the realization. int W = 10; int H = 11; - expected_allocation = (3 * W * H + 1) * sizeof(int); + expected_allocation = 3 * W * H * sizeof(int); g.realize({W, H, 3}); @@ -53,7 +54,7 @@ int main(int argc, char **argv) { // We've aligned the x dimension, make sure the allocation reflects this. int W_aligned = (W + x_alignment - 1) & ~(x_alignment - 1); - expected_allocation = (W_aligned * H * 3 + 1) * sizeof(int); + expected_allocation = W_aligned * H * 3 * sizeof(int); // Force g to clear it's cache... g.compute_root(); diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index a21bdaa71243..a67af7602a18 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -802,31 +802,34 @@ class SimdOpCheck : public SimdOpCheckTest { } // VLD2 X - Load Two-Element Structures - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 8 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 8 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 4 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); + // These need to be vectorized at least 2 native vectors wide, + // so we get a full vectors' worth that we know is safe to + // access. + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); // VLD3 X - Load Three-Element Structures - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3 + y)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3 + y)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3 + y)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3 + y)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3 + y)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3 + y)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3 + y)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); // VLD4 X - Load Four-Element Structures - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4 + y)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4 + y)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4 + y)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4 + y)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4 + y)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4 + y)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4 + y)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); // VLDM X F, D Load Multiple Registers // VLDR X F, D Load Single Register diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 36303bd29c16..43786143aecd 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -302,6 +302,11 @@ class SimdOpCheckTest { virtual void setup_images() { for (auto p : image_params) { p.reset(); + + const int alignment_bytes = 16; + p.set_host_alignment(alignment_bytes); + const int alignment = alignment_bytes / p.type().bytes(); + p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); } } virtual bool test_all() { diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index dce07531db6a..ee312da988bc 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -437,10 +437,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("vnot(v*)", hvx_width / 2, ~u16_1); check("vnot(v*)", hvx_width / 4, ~u32_1); - // v62 - Broadcasting unsigned scalars - check("v*.b = vsplat(r*)", hvx_width / 1, in_u8(0)); - check("v*.h = vsplat(r*)", hvx_width / 2, in_u16(0)); - check("vsplat(r*)", hvx_width / 4, in_u32(0)); + // v62 - Broadcasting scalars + check("vsplat(r*)", hvx_width / 1, in_u8(y)); + check("vsplat(r*)", hvx_width / 2, in_u16(y)); + check("vsplat(r*)", hvx_width / 4, in_u32(y)); check("vmux(q*,v*,v*)", hvx_width / 1, select(i8_1 == i8_2, i8_3, i8_2)); check("vmux(q*,v*,v*)", hvx_width / 2, select(i16_1 == i16_2, i16_3, i16_2)); diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp index b91bbeaf7859..3bbfb672f512 100644 --- a/test/correctness/storage_folding.cpp +++ b/test/correctness/storage_folding.cpp @@ -1,14 +1,16 @@ #include "Halide.h" #include +#include + using namespace Halide; // Override Halide's malloc and free - -size_t custom_malloc_size = 0; +const int tolerance = 3 * sizeof(int); +std::set custom_malloc_sizes; void *my_malloc(void *user_context, size_t x) { - custom_malloc_size = x; + custom_malloc_sizes.insert(x); void *orig = malloc(x + 32); void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5); ((void **)ptr)[-1] = orig; @@ -19,6 +21,28 @@ void my_free(void *user_context, void *ptr) { free(((void **)ptr)[-1]); } +bool check_expected_malloc(size_t expected) { + for (size_t i : custom_malloc_sizes) { + if (std::abs((int)i - (int)expected) <= tolerance) { + return true; + } + } + printf("Expected an allocation of size %d (tolerance %d). Got instead:\n", (int)expected, tolerance); + for (size_t i : custom_malloc_sizes) { + printf(" %d\n", (int)i); + } + return false; +} + +bool check_expected_mallocs(const std::vector &expected) { + for (size_t i : expected) { + if (!check_expected_malloc(i)) { + return false; + } + } + return true; +} + #ifdef _WIN32 #define DLLEXPORT __declspec(dllexport) #else @@ -111,9 +135,8 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000, 3}); - size_t expected_size = 101 * 4 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 4 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } @@ -133,9 +156,8 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000, 3}); - size_t expected_size = 101 * 1002 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 1002 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } @@ -157,15 +179,14 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000}); - size_t expected_size = 101 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -180,7 +201,7 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - if (custom_malloc_size != 0) { + if (!custom_malloc_sizes.empty()) { printf("There should not have been a heap allocation\n"); return -1; } @@ -197,7 +218,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -213,7 +234,7 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - if (custom_malloc_size != 0) { + if (!custom_malloc_sizes.empty()) { printf("There should not have been a heap allocation\n"); return -1; } @@ -230,7 +251,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -247,10 +268,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 2 * 1002 * 4 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 2 * 1000 * 4 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -266,7 +285,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -285,10 +304,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 1000 * 8 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 1000 * 8 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -304,7 +321,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -322,10 +339,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 2 * 1002 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 2 * 1000 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -341,24 +356,21 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; + // This is tricky due to upsampling. g(x, y) = x * y; f(x, y) = g(x, y / 2) + g(x, y / 2 + 1); - // The automatic storage folding optimization can't figure - // this out due to the downsampling. Explicitly fold it. - g.compute_at(f, x).store_root().fold_storage(y, 2); + g.compute_at(f, x).store_root(); f.set_custom_allocator(my_malloc, my_free); Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 1000 * 2 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 1000 * 2 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -394,12 +406,11 @@ int main(int argc, char **argv) { size_t expected_size; if (interleave) { - expected_size = 101 * 3 * 3 * sizeof(int) + sizeof(int); + expected_size = 101 * 3 * 3 * sizeof(int); } else { - expected_size = 101 * 3 * sizeof(int) + sizeof(int); + expected_size = 101 * 3 * sizeof(int); } - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + if (!check_expected_mallocs({expected_size})) { return -1; } }