From a28a503dbf30ae130eccabd277cb3018e88077cf Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Wed, 3 Mar 2021 09:49:09 -0700 Subject: [PATCH 01/13] Safer version of vldN code generation. --- src/CodeGen_ARM.cpp | 46 +++++++++----------------- test/correctness/simd_op_check.cpp | 28 ++++++++-------- test/correctness/simd_op_check.h | 5 +++ test/correctness/simd_op_check_hvx.cpp | 8 ++--- 4 files changed, 39 insertions(+), 48 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 89260d758cc7..7c3293a343b2 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1000,41 +1000,27 @@ void CodeGen_ARM::visit(const Load *op) { return; } - // Strided loads with known stride - if (stride && stride->value >= 2 && stride->value <= 4) { - // Check alignment on the base. Attempt to shift to an earlier - // address if it simplifies the expression. This makes - // adjacent strided loads shared a vldN op. + // Try to rewrite strided loads as shuffles of dense loads, + // aligned to the stride. This makes adjacent strided loads + // shared a vldN op. However, this is only safe when the new + // load would not cross any alignment boundaries not crossed + // by the original load. + ModulusRemainder alignment = op->alignment; + if (stride && stride->value >= 2 && stride->value <= 4 && + alignment.modulus % stride->value == 0) { Expr base = ramp->base; - int offset = 0; - ModulusRemainder mod_rem = modulus_remainder(ramp->base); - - const Add *add = base.as(); - const IntImm *add_b = add ? add->b.as() : nullptr; - - if ((mod_rem.modulus % stride->value) == 0) { - offset = mod_rem.remainder % stride->value; - } else if ((mod_rem.modulus == 1) && add_b) { - offset = add_b->value % stride->value; - if (offset < 0) { - offset += stride->value; - } - } - + int offset = mod_imp(alignment.remainder, stride->value); if (offset) { base = simplify(base - offset); - mod_rem.remainder -= offset; - if (mod_rem.modulus) { - mod_rem.remainder = mod_imp(mod_rem.remainder, mod_rem.modulus); - } + alignment.remainder -= offset; } - int alignment = op->type.bytes(); - alignment *= gcd(mod_rem.modulus, mod_rem.remainder); + int align_bytes = + gcd(alignment.modulus, alignment.remainder) * op->type.bytes(); // Maximum stack alignment on arm is 16 bytes, so we should // never claim alignment greater than that. - alignment = gcd(alignment, 16); - internal_assert(alignment > 0); + align_bytes = gcd(align_bytes, 16); + internal_assert(align_bytes > 0); // Decide what width to slice things into. If not a multiple // of 64 or 128 bits, then we can't safely slice it up into @@ -1068,9 +1054,9 @@ void CodeGen_ARM::visit(const Load *op) { Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type); LoadInst *loadI = cast(builder->CreateLoad(bitcastI)); #if LLVM_VERSION >= 110 - loadI->setAlignment(Align(alignment)); + loadI->setAlignment(Align(align_bytes)); #else - loadI->setAlignment(MaybeAlign(alignment)); + loadI->setAlignment(MaybeAlign(align_bytes)); #endif add_tbaa_metadata(loadI, op->name, slice_ramp); Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV); diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 274ed02ce1ea..918d60e48bec 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -811,22 +811,22 @@ class SimdOpCheck : public SimdOpCheckTest { check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); // VLD3 X - Load Three-Element Structures - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3 + y)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3 + y)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3 + y)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3 + y)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3 + y)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3 + y)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3 + y)); + check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3)); // VLD4 X - Load Four-Element Structures - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4 + y)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4 + y)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4 + y)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4 + y)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4 + y)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4 + y)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4 + y)); + check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4)); // VLDM X F, D Load Multiple Registers // VLDR X F, D Load Single Register diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 36303bd29c16..b8e3f6636134 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -52,6 +52,11 @@ class SimdOpCheckTest { .with_feature(Target::NoRuntime) .with_feature(Target::DisableLLVMLoopOpt); num_threads = Internal::ThreadPool::num_processors_online(); + + // Some tests need to know the alignment of the buffer. + for (ImageParam i : image_params) { + i.dim(0).set_min(0); + } } virtual ~SimdOpCheckTest() = default; size_t get_num_threads() const { diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 943abd0a7895..8566593da450 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -437,10 +437,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest { check("vnot(v*)", hvx_width / 2, ~u16_1); check("vnot(v*)", hvx_width / 4, ~u32_1); - // v62 - Broadcasting unsigned scalars - check("v*.b = vsplat(r*)", hvx_width / 1, in_u8(0)); - check("v*.h = vsplat(r*)", hvx_width / 2, in_u16(0)); - check("vsplat(r*)", hvx_width / 4, in_u32(0)); + // v62 - Broadcasting scalars + check("vsplat(r*)", hvx_width / 1, in_u8(y)); + check("vsplat(r*)", hvx_width / 2, in_u16(y)); + check("vsplat(r*)", hvx_width / 4, in_u32(y)); check("vmux(q*,v*,v*)", hvx_width / 1, select(i8_1 == i8_2, i8_3, i8_2)); check("vmux(q*,v*,v*)", hvx_width / 2, select(i16_1 == i16_2, i16_3, i16_2)); From 94ca00ae3f5b06960bde58790b10c28d26b9ac4c Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Wed, 3 Mar 2021 10:53:44 -0700 Subject: [PATCH 02/13] Only be more conservative with alignment for external buffers. --- src/CodeGen_ARM.cpp | 26 +++++++++++++++++++++----- src/CodeGen_ARM.h | 2 ++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7c3293a343b2..0281c9c556fb 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -972,6 +972,12 @@ void CodeGen_ARM::visit(const Store *op) { CodeGen_Posix::visit(op); } +int CodeGen_ARM::allocation_padding(Type type) const { + // We want to be able to load up to 3 values past the end of the buffer in + // the Load visitor. + return std::max(CodeGen_Posix::allocation_padding(type), type.bytes() * 3); +} + void CodeGen_ARM::visit(const Load *op) { // Predicated load if (!is_const_one(op->predicate)) { @@ -1002,12 +1008,22 @@ void CodeGen_ARM::visit(const Load *op) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. This makes adjacent strided loads - // shared a vldN op. However, this is only safe when the new - // load would not cross any alignment boundaries not crossed - // by the original load. + // shared a vldN op. ModulusRemainder alignment = op->alignment; - if (stride && stride->value >= 2 && stride->value <= 4 && - alignment.modulus % stride->value == 0) { + if (stride && stride->value >= 2 && stride->value <= 4) { + // For internal buffers, we know this is safe because we + // allocate an extra 3 values of padding. For external + // buffers, we can only do this if we know the alignment + // makes it safe. + // (In ASAN mode, don't read beyond the end of internal buffers either, + // as ASAN will complain even about harmless stack overreads.) + bool external = op->param.defined() || op->image.defined(); + if (alignment.modulus % stride->value != 0 && + (external || target.has_feature(Target::ASAN))) { + CodeGen_Posix::visit(op); + return; + } + Expr base = ramp->base; int offset = mod_imp(alignment.remainder, stride->value); if (offset) { diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h index 39d1a39d7a0f..49d31b9e1662 100644 --- a/src/CodeGen_ARM.h +++ b/src/CodeGen_ARM.h @@ -44,6 +44,8 @@ class CodeGen_ARM : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &) override; // @} + int allocation_padding(Type type) const override; + /** Various patterns to peephole match against */ struct Pattern { std::string intrin; ///< Name of the intrinsic From e4574a7b1644844f14fbfbfa91371ddcb1ef1e3b Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Wed, 3 Mar 2021 13:13:29 -0700 Subject: [PATCH 03/13] Add tolerance to allocation size tests. --- test/correctness/nested_tail_strategies.cpp | 5 +- test/correctness/pseudostack_shares_slots.cpp | 31 ++++--- test/correctness/reorder_storage.cpp | 15 +-- test/correctness/storage_folding.cpp | 93 +++++++++++-------- 4 files changed, 84 insertions(+), 60 deletions(-) diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp index e3b11503aff2..2f435f975dd3 100644 --- a/test/correctness/nested_tail_strategies.cpp +++ b/test/correctness/nested_tail_strategies.cpp @@ -45,14 +45,15 @@ void check(Func out, int line, std::vector tails) { largest_allocation = 0; out.realize({s}); size_t expected = (s + 1) * 4; - if (largest_allocation > expected) { + size_t tolerance = 3 * sizeof(int); + if (largest_allocation > expected + tolerance) { std::cerr << "Failure on line " << line << "\n" << "with tail strategies: "; for (auto t : tails) { std::cerr << t << " "; } std::cerr << "\n allocation of " << largest_allocation - << " bytes is too large. Expected " << expected << "\n"; + << " bytes is too large. Expected " << expected + tolerance << "\n"; abort(); } } diff --git a/test/correctness/pseudostack_shares_slots.cpp b/test/correctness/pseudostack_shares_slots.cpp index fd491ba149ea..a9e2f49dd048 100644 --- a/test/correctness/pseudostack_shares_slots.cpp +++ b/test/correctness/pseudostack_shares_slots.cpp @@ -2,10 +2,11 @@ using namespace Halide; -std::vector mallocs; +const int tolerance = 3 * sizeof(int); +std::vector mallocs; void *my_malloc(void *user_context, size_t x) { - mallocs.push_back(x); + mallocs.push_back((int)x); void *orig = malloc(x + 32); void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5); ((void **)ptr)[-1] = orig; @@ -52,9 +53,11 @@ int main(int argc, char **argv) { mallocs.clear(); p.set(sz); chain.back().realize({1024}); - size_t sz1 = sz + 2 * 20 - 1; - size_t sz2 = sz1 - 2; - if (mallocs.size() != 2 || mallocs[0] != sz1 || mallocs[1] != sz2) { + int sz1 = sz + 2 * 20 - 1; + int sz2 = sz1 - 2; + if (mallocs.size() != 2 || + std::abs(mallocs[0] - sz1) > tolerance || + std::abs(mallocs[1] - sz2) > tolerance) { printf("Incorrect allocations: %d %d %d\n", (int)mallocs.size(), (int)mallocs[0], (int)mallocs[1]); printf("Expected: 2 %d %d\n", (int)sz1, (int)sz2); return -1; @@ -93,14 +96,18 @@ int main(int argc, char **argv) { mallocs.clear(); p.set(sz); chain.back().realize({1024}); - size_t sz1 = sz / 8 + 23; - size_t sz2 = sz1 - 2; - size_t sz3 = sz + 19; - size_t sz4 = sz3 - 2; - if (mallocs.size() != 4 || mallocs[0] != sz1 || mallocs[1] != sz2 || mallocs[2] != sz3 || mallocs[3] != sz4) { + int sz1 = sz / 8 + 23; + int sz2 = sz1 - 2; + int sz3 = sz + 19; + int sz4 = sz3 - 2; + if (mallocs.size() != 4 || + std::abs(mallocs[0] - sz1) > tolerance || + std::abs(mallocs[1] - sz2) > tolerance || + std::abs(mallocs[2] - sz3) > tolerance || + std::abs(mallocs[3] - sz4) > tolerance) { printf("Incorrect allocations: %d %d %d %d %d\n", (int)mallocs.size(), - (int)mallocs[0], (int)mallocs[1], (int)mallocs[2], (int)mallocs[3]); - printf("Expected: 4 %d %d %d %d\n", (int)sz1, (int)sz2, (int)sz3, (int)sz4); + mallocs[0], mallocs[1], mallocs[2], mallocs[3]); + printf("Expected: 4 %d %d %d %d\n", sz1, sz2, sz3, sz4); return -1; } } diff --git a/test/correctness/reorder_storage.cpp b/test/correctness/reorder_storage.cpp index 546812bad57a..953cd2b1e821 100644 --- a/test/correctness/reorder_storage.cpp +++ b/test/correctness/reorder_storage.cpp @@ -3,11 +3,13 @@ using namespace Halide; -size_t expected_allocation = 0; +// Backends allocate up to 3 extra elements. +int tolerance = 3 * sizeof(int); +int expected_allocation = 0; void *my_malloc(void *user_context, size_t x) { - if (x != expected_allocation) { - printf("Error! Expected allocation of %zu bytes, got %zu bytes\n", expected_allocation, x); + if (std::abs((int)x - expected_allocation) > tolerance) { + printf("Error! Expected allocation of %d bytes, got %zu bytes (tolerance %d)\n", expected_allocation, x, tolerance); exit(-1); } return malloc(x); @@ -40,11 +42,10 @@ int main(int argc, char **argv) { g.set_custom_allocator(my_malloc, my_free); // Without any storage alignment, we should expect an allocation - // that is the product of the extents of the realization (plus one - // for the magical extra Halide element). + // that is the product of the extents of the realization. int W = 10; int H = 11; - expected_allocation = (3 * W * H + 1) * sizeof(int); + expected_allocation = 3 * W * H * sizeof(int); g.realize({W, H, 3}); @@ -53,7 +54,7 @@ int main(int argc, char **argv) { // We've aligned the x dimension, make sure the allocation reflects this. int W_aligned = (W + x_alignment - 1) & ~(x_alignment - 1); - expected_allocation = (W_aligned * H * 3 + 1) * sizeof(int); + expected_allocation = W_aligned * H * 3 * sizeof(int); // Force g to clear it's cache... g.compute_root(); diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp index b91bbeaf7859..50533c22768f 100644 --- a/test/correctness/storage_folding.cpp +++ b/test/correctness/storage_folding.cpp @@ -1,14 +1,16 @@ #include "Halide.h" #include +#include + using namespace Halide; // Override Halide's malloc and free - -size_t custom_malloc_size = 0; +const int tolerance = 3 * sizeof(int); +std::set custom_malloc_sizes; void *my_malloc(void *user_context, size_t x) { - custom_malloc_size = x; + custom_malloc_sizes.insert(x); void *orig = malloc(x + 32); void *ptr = (void *)((((size_t)orig + 32) >> 5) << 5); ((void **)ptr)[-1] = orig; @@ -19,6 +21,28 @@ void my_free(void *user_context, void *ptr) { free(((void **)ptr)[-1]); } +bool check_expected_malloc(size_t expected) { + for (size_t i : custom_malloc_sizes) { + if (std::abs((int)i - (int)expected) <= tolerance) { + return true; + } + } + printf("Expected an allocation of size %d (tolerance %d). Got instead:\n", (int)expected, tolerance); + for (size_t i : custom_malloc_sizes) { + printf(" %d\n", (int)i); + } + return false; +} + +bool check_expected_mallocs(const std::vector &expected) { + for (size_t i : expected) { + if (!check_expected_malloc(i)) { + return false; + } + } + return true; +} + #ifdef _WIN32 #define DLLEXPORT __declspec(dllexport) #else @@ -111,9 +135,8 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000, 3}); - size_t expected_size = 101 * 4 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 4 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } @@ -133,9 +156,8 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000, 3}); - size_t expected_size = 101 * 1002 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 1002 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } @@ -157,15 +179,14 @@ int main(int argc, char **argv) { Buffer im = g.realize({100, 1000}); - size_t expected_size = 101 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 101 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -180,7 +201,7 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - if (custom_malloc_size != 0) { + if (!custom_malloc_sizes.empty()) { printf("There should not have been a heap allocation\n"); return -1; } @@ -197,7 +218,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -213,7 +234,7 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - if (custom_malloc_size != 0) { + if (!custom_malloc_sizes.empty()) { printf("There should not have been a heap allocation\n"); return -1; } @@ -230,7 +251,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -248,9 +269,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 2 * 1002 * 4 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 2 * 1000 * 4 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -266,7 +286,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -286,9 +306,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 1000 * 8 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 1000 * 8 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -304,7 +323,7 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; g(x, y) = x * y; @@ -323,9 +342,8 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 2 * 1002 * 3 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 2 * 1000 * 3 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -341,24 +359,22 @@ int main(int argc, char **argv) { } { - custom_malloc_size = 0; + custom_malloc_sizes.clear(); Func f, g; + // This is tricky due to upsampling. g(x, y) = x * y; f(x, y) = g(x, y / 2) + g(x, y / 2 + 1); - // The automatic storage folding optimization can't figure - // this out due to the downsampling. Explicitly fold it. - g.compute_at(f, x).store_root().fold_storage(y, 2); + g.compute_at(f, x).store_root(); f.set_custom_allocator(my_malloc, my_free); Buffer im = f.realize({1000, 1000}); // Halide allocates one extra scalar, so we account for that. - size_t expected_size = 1000 * 2 * sizeof(int) + sizeof(int); - if (custom_malloc_size == 0 || custom_malloc_size > expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + size_t expected_size = 1000 * 2 * sizeof(int); + if (!check_expected_mallocs({expected_size})) { return -1; } @@ -394,12 +410,11 @@ int main(int argc, char **argv) { size_t expected_size; if (interleave) { - expected_size = 101 * 3 * 3 * sizeof(int) + sizeof(int); + expected_size = 101 * 3 * 3 * sizeof(int); } else { - expected_size = 101 * 3 * sizeof(int) + sizeof(int); + expected_size = 101 * 3 * sizeof(int); } - if (custom_malloc_size == 0 || custom_malloc_size != expected_size) { - printf("Scratch space allocated was %d instead of %d\n", (int)custom_malloc_size, (int)expected_size); + if (!check_expected_mallocs({expected_size})) { return -1; } } From 50a91e1c753b0643e986a6d0d4c46e01c1ed52ab Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Wed, 3 Mar 2021 13:21:01 -0700 Subject: [PATCH 04/13] Remove old comments. --- test/correctness/storage_folding.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/correctness/storage_folding.cpp b/test/correctness/storage_folding.cpp index 50533c22768f..3bbfb672f512 100644 --- a/test/correctness/storage_folding.cpp +++ b/test/correctness/storage_folding.cpp @@ -268,7 +268,6 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. size_t expected_size = 2 * 1000 * 4 * sizeof(int); if (!check_expected_mallocs({expected_size})) { return -1; @@ -305,7 +304,6 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. size_t expected_size = 1000 * 8 * sizeof(int); if (!check_expected_mallocs({expected_size})) { return -1; @@ -341,7 +339,6 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. size_t expected_size = 2 * 1000 * 3 * sizeof(int); if (!check_expected_mallocs({expected_size})) { return -1; @@ -372,7 +369,6 @@ int main(int argc, char **argv) { Buffer im = f.realize({1000, 1000}); - // Halide allocates one extra scalar, so we account for that. size_t expected_size = 1000 * 2 * sizeof(int); if (!check_expected_mallocs({expected_size})) { return -1; From 8c42f5148e4c4ea5107e5d20522ee608ecb8711e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 4 Mar 2021 20:18:03 -0700 Subject: [PATCH 05/13] Improve ARM alignment and vldN code generation. --- src/CodeGen_ARM.cpp | 47 ++++++++++++++++++++++---------- test/correctness/simd_op_check.h | 10 +++---- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index df859ddcf484..c3ec666865c2 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1047,29 +1047,46 @@ void CodeGen_ARM::visit(const Load *op) { // aligned to the stride. This makes adjacent strided loads // shared a vldN op. ModulusRemainder alignment = op->alignment; + alignment.remainder = mod_imp(alignment.remainder, alignment.modulus); if (stride && stride->value >= 2 && stride->value <= 4) { - // For internal buffers, we know this is safe because we - // allocate an extra 3 values of padding. For external - // buffers, we can only do this if we know the alignment - // makes it safe. + Expr base = ramp->base; + int aligned_stride = gcd(stride->value, alignment.modulus); + int offset = 0; + if (aligned_stride == stride->value) { + offset = mod_imp((int)alignment.remainder, aligned_stride); + } else { + const Add *add = ramp->base.as(); + if (const IntImm *add_c = add ? add->b.as() : ramp->base.as()) { + offset = mod_imp(add_c->value, stride->value); + } + } + + if (offset) { + base = simplify(base - offset); + alignment.remainder = offset; + } + + // We want to load a few more bytes than the original load did. + // This is safe under these conditions: + // - The alignment information is sufficient to determine the + // modified load is safe. + // - For internal buffers, we know this is safe because we + // allocate an extra 3 values of padding. // (In ASAN mode, don't read beyond the end of internal buffers either, // as ASAN will complain even about harmless stack overreads.) + // The min moves lower by offset. + bool alignment_safe = alignment.remainder >= 0; + // The max moves higher by stride->value - 1 - offset + alignment_safe &= + alignment.remainder + stride->value - 1 - offset <= alignment.modulus; bool external = op->param.defined() || op->image.defined(); - if (alignment.modulus % stride->value != 0 && - (external || target.has_feature(Target::ASAN))) { + if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { CodeGen_Posix::visit(op); return; } - Expr base = ramp->base; - int offset = mod_imp(alignment.remainder, stride->value); - if (offset) { - base = simplify(base - offset); - alignment.remainder -= offset; - } - - int align_bytes = - gcd(alignment.modulus, alignment.remainder) * op->type.bytes(); + alignment.remainder = mod_imp(alignment.remainder, alignment.modulus); + int align_bytes = gcd(alignment.modulus, alignment.remainder) * op->type.bytes(); // Maximum stack alignment on arm is 16 bytes, so we should // never claim alignment greater than that. align_bytes = gcd(align_bytes, 16); diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index b8e3f6636134..43786143aecd 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -52,11 +52,6 @@ class SimdOpCheckTest { .with_feature(Target::NoRuntime) .with_feature(Target::DisableLLVMLoopOpt); num_threads = Internal::ThreadPool::num_processors_online(); - - // Some tests need to know the alignment of the buffer. - for (ImageParam i : image_params) { - i.dim(0).set_min(0); - } } virtual ~SimdOpCheckTest() = default; size_t get_num_threads() const { @@ -307,6 +302,11 @@ class SimdOpCheckTest { virtual void setup_images() { for (auto p : image_params) { p.reset(); + + const int alignment_bytes = 16; + p.set_host_alignment(alignment_bytes); + const int alignment = alignment_bytes / p.type().bytes(); + p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); } } virtual bool test_all() { From 927583b69cd42e000a2c07bec9f37f0460c4b983 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 4 Mar 2021 22:28:06 -0700 Subject: [PATCH 06/13] Remove merge straggler --- src/CodeGen_ARM.h | 74 ----------------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 src/CodeGen_ARM.h diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h deleted file mode 100644 index 49d31b9e1662..000000000000 --- a/src/CodeGen_ARM.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef HALIDE_CODEGEN_ARM_H -#define HALIDE_CODEGEN_ARM_H - -/** \file - * Defines the code-generator for producing ARM machine code - */ - -#include - -#include "CodeGen_Posix.h" - -namespace Halide { - -struct Target; - -namespace Internal { - -/** A code generator that emits ARM code from a given Halide stmt. */ -class CodeGen_ARM : public CodeGen_Posix { -public: - /** Create an ARM code generator for the given arm target. */ - CodeGen_ARM(const Target &); - -protected: - using CodeGen_Posix::visit; - - /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that - * takes one vector argument and splits it into two to call inner. */ - llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const std::string &name); - void init_module() override; - - /** Nodes for which we want to emit specific neon intrinsics */ - // @{ - void visit(const Cast *) override; - void visit(const Sub *) override; - void visit(const Mul *) override; - void visit(const Min *) override; - void visit(const Max *) override; - void visit(const Store *) override; - void visit(const Load *) override; - void visit(const Call *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &) override; - // @} - - int allocation_padding(Type type) const override; - - /** Various patterns to peephole match against */ - struct Pattern { - std::string intrin; ///< Name of the intrinsic - Expr pattern; ///< The pattern to match against - Pattern() = default; - Pattern(const std::string &intrin, Expr p) - : intrin(intrin), pattern(std::move(p)) { - } - }; - std::vector casts, averagings, negations; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - // NEON can be disabled for older processors. - bool neon_intrinsics_disabled() { - return target.has_feature(Target::NoNEON); - } -}; - -} // namespace Internal -} // namespace Halide - -#endif From 3099b2db514c90ede69daee4d627012d58ccfa1b Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 4 Mar 2021 22:52:18 -0700 Subject: [PATCH 07/13] Fix alignment condition (again). --- src/CodeGen_ARM.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index c3ec666865c2..aca91f468ca3 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1076,9 +1076,10 @@ void CodeGen_ARM::visit(const Load *op) { // as ASAN will complain even about harmless stack overreads.) // The min moves lower by offset. bool alignment_safe = alignment.remainder >= 0; - // The max moves higher by stride->value - 1 - offset - alignment_safe &= - alignment.remainder + stride->value - 1 - offset <= alignment.modulus; + // The max needs to not cross an alignment boundary. + int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value; + int new_max = old_max + stride->value - 1 - offset; + alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus); bool external = op->param.defined() || op->image.defined(); if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { CodeGen_Posix::visit(op); From 40d18c8b734abb648315382660bbc069a88df17b Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 5 Mar 2021 01:40:45 -0700 Subject: [PATCH 08/13] Fix alignment. --- src/CodeGen_ARM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index aca91f468ca3..aa78efee7254 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1063,7 +1063,7 @@ void CodeGen_ARM::visit(const Load *op) { if (offset) { base = simplify(base - offset); - alignment.remainder = offset; + alignment.remainder -= offset; } // We want to load a few more bytes than the original load did. From 7a911fb0b9fe5c317a56b47a2b7a319829ea5598 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 5 Mar 2021 11:21:30 -0700 Subject: [PATCH 09/13] Avoid divide by zero. --- src/CodeGen_ARM.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index aa78efee7254..a77612aeb5e8 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1077,9 +1077,11 @@ void CodeGen_ARM::visit(const Load *op) { // The min moves lower by offset. bool alignment_safe = alignment.remainder >= 0; // The max needs to not cross an alignment boundary. - int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value; - int new_max = old_max + stride->value - 1 - offset; - alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus); + if (alignment.modulus > 0) { + int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value; + int new_max = old_max + stride->value - 1 - offset; + alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus); + } bool external = op->param.defined() || op->image.defined(); if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { CodeGen_Posix::visit(op); From 1786c5f72099bbd1e2df3f349a8e4cd5d5191d18 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 8 Mar 2021 18:06:31 -0700 Subject: [PATCH 10/13] Move CodeGen_ARM's logic for strided loads to CodeGen_LLVM. --- src/CodeGen_ARM.cpp | 111 +--------------------------- src/CodeGen_LLVM.cpp | 165 ++++++++++++++++++++++++------------------ src/CodeGen_LLVM.h | 5 +- src/CodeGen_Posix.cpp | 4 +- 4 files changed, 101 insertions(+), 184 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index a77612aeb5e8..8054307cf271 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -62,8 +62,6 @@ class CodeGen_ARM : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &) override; // @} - int allocation_padding(Type type) const override; - /** Various patterns to peephole match against */ struct Pattern { string intrin; ///< Name of the intrinsic @@ -1009,12 +1007,6 @@ void CodeGen_ARM::visit(const Store *op) { CodeGen_Posix::visit(op); } -int CodeGen_ARM::allocation_padding(Type type) const { - // We want to be able to load up to 3 values past the end of the buffer in - // the Load visitor. - return std::max(CodeGen_Posix::allocation_padding(type), type.bytes() * 3); -} - void CodeGen_ARM::visit(const Load *op) { // Predicated load if (!is_const_one(op->predicate)) { @@ -1035,112 +1027,13 @@ void CodeGen_ARM::visit(const Load *op) { return; } + // If the stride is in [-1, 4], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; - - // If the stride is one or minus one, we can deal with that using vanilla codegen - if (stride && (stride->value == 1 || stride->value == -1)) { + if (stride && (-1 <= stride->value && stride->value <= 4)) { CodeGen_Posix::visit(op); return; } - // Try to rewrite strided loads as shuffles of dense loads, - // aligned to the stride. This makes adjacent strided loads - // shared a vldN op. - ModulusRemainder alignment = op->alignment; - alignment.remainder = mod_imp(alignment.remainder, alignment.modulus); - if (stride && stride->value >= 2 && stride->value <= 4) { - Expr base = ramp->base; - int aligned_stride = gcd(stride->value, alignment.modulus); - int offset = 0; - if (aligned_stride == stride->value) { - offset = mod_imp((int)alignment.remainder, aligned_stride); - } else { - const Add *add = ramp->base.as(); - if (const IntImm *add_c = add ? add->b.as() : ramp->base.as()) { - offset = mod_imp(add_c->value, stride->value); - } - } - - if (offset) { - base = simplify(base - offset); - alignment.remainder -= offset; - } - - // We want to load a few more bytes than the original load did. - // This is safe under these conditions: - // - The alignment information is sufficient to determine the - // modified load is safe. - // - For internal buffers, we know this is safe because we - // allocate an extra 3 values of padding. - // (In ASAN mode, don't read beyond the end of internal buffers either, - // as ASAN will complain even about harmless stack overreads.) - // The min moves lower by offset. - bool alignment_safe = alignment.remainder >= 0; - // The max needs to not cross an alignment boundary. - if (alignment.modulus > 0) { - int old_max = alignment.remainder + (op->type.lanes() - 1) * stride->value; - int new_max = old_max + stride->value - 1 - offset; - alignment_safe &= (old_max / alignment.modulus == new_max / alignment.modulus); - } - bool external = op->param.defined() || op->image.defined(); - if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { - CodeGen_Posix::visit(op); - return; - } - - alignment.remainder = mod_imp(alignment.remainder, alignment.modulus); - int align_bytes = gcd(alignment.modulus, alignment.remainder) * op->type.bytes(); - // Maximum stack alignment on arm is 16 bytes, so we should - // never claim alignment greater than that. - align_bytes = gcd(align_bytes, 16); - internal_assert(align_bytes > 0); - - // Decide what width to slice things into. If not a multiple - // of 64 or 128 bits, then we can't safely slice it up into - // some number of vlds, so we hand it over the base class. - int bit_width = op->type.bits() * op->type.lanes(); - int intrin_lanes = 0; - if (bit_width % 128 == 0) { - intrin_lanes = 128 / op->type.bits(); - } else if (bit_width % 64 == 0) { - intrin_lanes = 64 / op->type.bits(); - } else { - CodeGen_Posix::visit(op); - return; - } - - llvm::Type *load_return_type = llvm_type_of(op->type.with_lanes(intrin_lanes * stride->value)); - llvm::Type *load_return_pointer_type = load_return_type->getPointerTo(); - Value *undef = UndefValue::get(load_return_type); - SmallVector constants; - for (int j = 0; j < intrin_lanes; j++) { - Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset); - constants.push_back(constant); - } - Constant *constantsV = ConstantVector::get(constants); - - vector results; - for (int i = 0; i < op->type.lanes(); i += intrin_lanes) { - Expr slice_base = simplify(base + i * ramp->stride); - Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_lanes); - Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base); - Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type); - LoadInst *loadI = cast(builder->CreateLoad(bitcastI)); -#if LLVM_VERSION >= 110 - loadI->setAlignment(Align(align_bytes)); -#else - loadI->setAlignment(MaybeAlign(align_bytes)); -#endif - add_tbaa_metadata(loadI, op->name, slice_ramp); - Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV); - results.push_back(shuffleInstr); - } - - // Concat the results - value = concat_vectors(results); - return; - } - // We have builtins for strided loads with fixed but unknown stride, but they use inline assembly. if (target.bits != 64 /* Not yet implemented for aarch64 */) { ostringstream builtin; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index eda89aca82da..5d82c3013f7b 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1910,63 +1910,76 @@ void CodeGen_LLVM::visit(const Load *op) { if (ramp && stride && stride->value == 1) { value = codegen_dense_vector_load(op); - } else if (ramp && stride && stride->value == 2) { - // Load two vectors worth and then shuffle - Expr base_a = ramp->base, base_b = ramp->base + ramp->lanes; - Expr stride_a = make_one(base_a.type()); - Expr stride_b = make_one(base_b.type()); - - ModulusRemainder align_a = op->alignment; - ModulusRemainder align_b = align_a + ramp->lanes; - - // False indicates we should take the even-numbered lanes - // from the load, true indicates we should take the - // odd-numbered-lanes. - bool shifted_a = false, shifted_b = false; + } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { + // Try to rewrite strided loads as shuffles of dense loads, + // aligned to the stride. This makes adjacent strided loads + // shared a vldN op. + ModulusRemainder align = op->alignment; + Expr base = ramp->base; + int aligned_stride = gcd(stride->value, align.modulus); + int offset = 0; + if (aligned_stride == stride->value) { + offset = mod_imp((int)align.remainder, aligned_stride); + } else { + const Add *add = base.as(); + if (const IntImm *add_c = add ? add->b.as() : base.as()) { + offset = mod_imp(add_c->value, stride->value); + } + } - bool external = op->param.defined() || op->image.defined(); + if (offset) { + base = simplify(base - offset); + align.remainder -= offset; + } - // Don't read beyond the end of an external buffer. + // We want to load a few more bytes than the original load did. + // This is safe under these conditions: + // - The alignment information is sufficient to determine the + // modified load is safe. + // - For internal buffers, we know this is safe because we + // allocate an extra 3 values of padding. // (In ASAN mode, don't read beyond the end of internal buffers either, // as ASAN will complain even about harmless stack overreads.) - if (external || target.has_feature(Target::ASAN)) { - base_b -= 1; - align_b = align_b - 1; - shifted_b = true; - } else { - // If the base ends in an odd constant, then subtract one - // and do a different shuffle. This helps expressions like - // (f(2*x) + f(2*x+1) share loads - const Add *add = ramp->base.as(); - const IntImm *offset = add ? add->b.as() : ramp->base.as(); - if (offset && offset->value & 1) { - base_a -= 1; - align_a = align_a - 1; - shifted_a = true; - base_b -= 1; - align_b = align_b - 1; - shifted_b = true; - } + // The min moves lower by offset. + int load_lanes = ramp->lanes * stride->value; + bool alignment_safe = align.remainder >= 0; + // The max needs to not cross an alignment boundary. + if (align.modulus > 0) { + int old_max = align.remainder + (op->type.lanes() - 1) * stride->value; + int new_max = old_max + stride->value - 1 - offset; + alignment_safe &= (old_max / align.modulus == new_max / align.modulus); + } + bool external = op->param.defined() || op->image.defined(); + if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { + load_lanes -= (stride->value - 1 - offset); } - // Do each load. - Expr ramp_a = Ramp::make(base_a, stride_a, ramp->lanes); - Expr ramp_b = Ramp::make(base_b, stride_b, ramp->lanes); - Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param, op->predicate, align_a); - Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param, op->predicate, align_b); - Value *vec_a = codegen(load_a); - Value *vec_b = codegen(load_b); + int slice_lanes = native_vector_bits() / op->type.bits(); - // Shuffle together the results. - vector indices(ramp->lanes); - for (int i = 0; i < (ramp->lanes + 1) / 2; i++) { - indices[i] = i * 2 + (shifted_a ? 1 : 0); - } - for (int i = (ramp->lanes + 1) / 2; i < ramp->lanes; i++) { - indices[i] = i * 2 + (shifted_b ? 1 : 0); + // We need to slice the result in to native vector lanes, otherwise + // LLVM misses optimizations like using ldN on ARM. + vector results; + for (int i = 0; i < op->type.lanes(); i += slice_lanes) { + int load_lanes_i = std::min(slice_lanes * stride->value, load_lanes - i); + int lanes_i = std::min(slice_lanes, op->type.lanes() - i); + Expr slice_base = simplify(base + i * ramp->stride); + + Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, + op->image, op->param, op->alignment, nullptr, false); + + SmallVector constants; + for (int j = 0; j < lanes_i; j++) { + Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset); + constants.push_back(constant); + } + Constant *constantsV = ConstantVector::get(constants); + Value *undef = UndefValue::get(load_i->getType()); + Value *shuffleInstr = builder->CreateShuffleVector(load_i, undef, constantsV); + results.push_back(shuffleInstr); } - value = shuffle_vectors(vec_a, vec_b, indices); + // Concat the results + value = concat_vectors(results); } else if (ramp && stride && stride->value == -1) { // Load the vector and then flip it in-place Expr flipped_base = ramp->base - ramp->lanes + 1; @@ -2249,14 +2262,14 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) { } } -Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) { - debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n"; - - const Ramp *ramp = load->index.as(); - internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n"; +llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred, bool slice_to_native) { + debug(4) << "Vectorize predicated dense vector load:\n\t" + << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; - bool is_external = (external_buffer.find(load->name) != external_buffer.end()); - int alignment = load->type.bytes(); // The size of a single element + bool is_external = (external_buffer.find(name) != external_buffer.end()); + int align_bytes = type.bytes(); // The size of a single element int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; @@ -2266,60 +2279,68 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) { // maximum alignment we can infer based on the index alone. // Boost the alignment if possible, up to the native vector width. - ModulusRemainder mod_rem = load->alignment; + ModulusRemainder mod_rem = alignment; while ((mod_rem.remainder & 1) == 0 && (mod_rem.modulus & 1) == 0 && - alignment < native_bytes) { + align_bytes < native_bytes) { mod_rem.modulus /= 2; mod_rem.remainder /= 2; - alignment *= 2; + align_bytes *= 2; } // If it is an external buffer, then we cannot assume that the host pointer // is aligned to at least native vector width. However, we may be able to do // better than just assuming that it is unaligned. if (is_external) { - if (load->param.defined()) { - int host_alignment = load->param.host_alignment(); - alignment = gcd(alignment, host_alignment); - } else if (get_target().has_feature(Target::JIT) && load->image.defined()) { + if (param.defined()) { + int host_alignment = param.host_alignment(); + align_bytes = gcd(align_bytes, host_alignment); + } else if (get_target().has_feature(Target::JIT) && image.defined()) { // If we're JITting, use the actual pointer value to determine alignment for embedded buffers. - alignment = gcd(alignment, (int)(((uintptr_t)load->image.data()) & std::numeric_limits::max())); + align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits::max())); } } // For dense vector loads wider than the native vector // width, bust them up into native vectors - int load_lanes = load->type.lanes(); - int native_lanes = std::max(1, native_bits / load->type.bits()); + int load_lanes = type.lanes(); + int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes; vector slices; for (int i = 0; i < load_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, load_lanes - i); - Expr slice_base = simplify(ramp->base + i); + Expr slice_base = simplify(base + i); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); - llvm::Type *slice_type = get_vector_type(llvm_type_of(load->type.element_of()), slice_lanes); - Value *elt_ptr = codegen_buffer_pointer(load->name, load->type.element_of(), slice_base); + llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes); + Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); Instruction *load_inst; if (vpred != nullptr) { Value *slice_mask = slice_vector(vpred, i, slice_lanes); #if LLVM_VERSION >= 110 - load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(alignment), slice_mask); + load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(align_bytes), slice_mask); #else - load_inst = builder->CreateMaskedLoad(vec_ptr, alignment, slice_mask); + load_inst = builder->CreateMaskedLoad(vec_ptr, align_bytes, slice_mask); #endif } else { - load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(alignment)); + load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(align_bytes)); } - add_tbaa_metadata(load_inst, load->name, slice_index); + add_tbaa_metadata(load_inst, name, slice_index); slices.push_back(load_inst); } value = concat_vectors(slices); return value; } +Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) { + const Ramp *ramp = load->index.as(); + internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n"; + + return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param, + load->alignment, vpred, slice_to_native); +} + void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) { const Ramp *ramp = op->index.as(); const IntImm *stride = ramp ? ramp->stride.as() : nullptr; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 092bc7713b5b..06ec634938ec 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -552,7 +552,10 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false); - llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr); + llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base, + const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment, + llvm::Value *vpred = nullptr, bool slice_to_native = true); + llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true); virtual void codegen_predicated_vector_load(const Load *op); virtual void codegen_predicated_vector_store(const Store *op); diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index 32b3202477aa..49a627b767b1 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -75,10 +75,10 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type } int CodeGen_Posix::allocation_padding(Type type) const { - // We potentially load one scalar value past the end of the + // We potentially load 3 scalar values past the end of the // buffer, so pad the allocation with an extra instance of the // scalar type. - return type.bytes(); + return 3 * type.bytes(); } CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type, From 945ae21bffae8abab34117326f2e117254299969 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 8 Mar 2021 18:28:32 -0700 Subject: [PATCH 11/13] Fix comment. --- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 5d82c3013f7b..f5aeb156b674 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1913,7 +1913,7 @@ void CodeGen_LLVM::visit(const Load *op) { } else if (ramp && stride && 2 <= stride->value && stride->value <= 4) { // Try to rewrite strided loads as shuffles of dense loads, // aligned to the stride. This makes adjacent strided loads - // shared a vldN op. + // share the same underlying dense loads. ModulusRemainder align = op->alignment; Expr base = ramp->base; int aligned_stride = gcd(stride->value, align.modulus); From ecb8daa40d3d49312e52a1d97ae674a8d7139b18 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 8 Mar 2021 18:53:26 -0700 Subject: [PATCH 12/13] clang-format. --- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f5aeb156b674..749ad56e3e73 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1965,7 +1965,7 @@ void CodeGen_LLVM::visit(const Load *op) { Expr slice_base = simplify(base + i * ramp->stride); Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base, - op->image, op->param, op->alignment, nullptr, false); + op->image, op->param, op->alignment, nullptr, false); SmallVector constants; for (int j = 0; j < lanes_i; j++) { From ace9fb3bc81bc6370fd8e9aad5dadda450534cf5 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Tue, 9 Mar 2021 12:20:02 -0700 Subject: [PATCH 13/13] Remove sketchy alignment check. --- src/CodeGen_LLVM.cpp | 16 ++--------- test/correctness/simd_op_check.cpp | 45 ++++++++++++++++-------------- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 749ad56e3e73..f8fdf854e6b2 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1933,24 +1933,14 @@ void CodeGen_LLVM::visit(const Load *op) { } // We want to load a few more bytes than the original load did. - // This is safe under these conditions: - // - The alignment information is sufficient to determine the - // modified load is safe. - // - For internal buffers, we know this is safe because we - // allocate an extra 3 values of padding. + // We know this is safe for internal buffers because we allocate + // padding. // (In ASAN mode, don't read beyond the end of internal buffers either, // as ASAN will complain even about harmless stack overreads.) // The min moves lower by offset. int load_lanes = ramp->lanes * stride->value; - bool alignment_safe = align.remainder >= 0; - // The max needs to not cross an alignment boundary. - if (align.modulus > 0) { - int old_max = align.remainder + (op->type.lanes() - 1) * stride->value; - int new_max = old_max + stride->value - 1 - offset; - alignment_safe &= (old_max / align.modulus == new_max / align.modulus); - } bool external = op->param.defined() || op->image.defined(); - if (!alignment_safe && (external || target.has_feature(Target::ASAN))) { + if (external || target.has_feature(Target::ASAN)) { load_lanes -= (stride->value - 1 - offset); } diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index d2bebe1899a7..a67af7602a18 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -802,31 +802,34 @@ class SimdOpCheck : public SimdOpCheckTest { } // VLD2 X - Load Two-Element Structures - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); - check(arm32 ? "vld2.32" : "ld2", 4 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 8 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); - check(arm32 ? "vld2.8" : "ld2", 8 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 4 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); - check(arm32 ? "vld2.16" : "ld2", 4 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); + // These need to be vectorized at least 2 native vectors wide, + // so we get a full vectors' worth that we know is safe to + // access. + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_i8(x * 2) + in_i8(x * 2 + 1)); + check(arm32 ? "vld2.8" : "ld2", 32 * w, in_u8(x * 2) + in_u8(x * 2 + 1)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_i16(x * 2) + in_i16(x * 2 + 1)); + check(arm32 ? "vld2.16" : "ld2", 16 * w, in_u16(x * 2) + in_u16(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_i32(x * 2) + in_i32(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_u32(x * 2) + in_u32(x * 2 + 1)); + check(arm32 ? "vld2.32" : "ld2", 8 * w, in_f32(x * 2) + in_f32(x * 2 + 1)); // VLD3 X - Load Three-Element Structures - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_i32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_u32(x * 3)); - check(arm32 ? "vld3.32" : "ld3", 4 * w, in_f32(x * 3)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_i8(x * 3)); - check(arm32 ? "vld3.8" : "ld3", 8 * w, in_u8(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_i16(x * 3)); - check(arm32 ? "vld3.16" : "ld3", 4 * w, in_u16(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_i8(x * 3)); + check(arm32 ? "vld3.8" : "ld3", 32 * w, in_u8(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_i16(x * 3)); + check(arm32 ? "vld3.16" : "ld3", 16 * w, in_u16(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_i32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_u32(x * 3)); + check(arm32 ? "vld3.32" : "ld3", 8 * w, in_f32(x * 3)); // VLD4 X - Load Four-Element Structures - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_i32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_u32(x * 4)); - check(arm32 ? "vld4.32" : "ld4", 4 * w, in_f32(x * 4)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_i8(x * 4)); - check(arm32 ? "vld4.8" : "ld4", 8 * w, in_u8(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_i16(x * 4)); - check(arm32 ? "vld4.16" : "ld4", 4 * w, in_u16(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_i8(x * 4)); + check(arm32 ? "vld4.8" : "ld4", 32 * w, in_u8(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_i16(x * 4)); + check(arm32 ? "vld4.16" : "ld4", 16 * w, in_u16(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_i32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_u32(x * 4)); + check(arm32 ? "vld4.32" : "ld4", 8 * w, in_f32(x * 4)); // VLDM X F, D Load Multiple Registers // VLDR X F, D Load Single Register