diff --git a/.github/workflows/testing-make.yml b/.github/workflows/testing-make.yml index 4419bc593977..12ea624367fb 100644 --- a/.github/workflows/testing-make.yml +++ b/.github/workflows/testing-make.yml @@ -77,6 +77,7 @@ jobs: "lld-${LLVM_VERSION}" \ "liblld-${LLVM_VERSION}-dev" echo "LLVM_CONFIG=llvm-config-${LLVM_VERSION}" | tee -a "$GITHUB_ENV" + cat /proc/cpuinfo elif [ "$RUNNER_OS" = "macOS" ]; then brew install libjpeg-turbo libpng pkgconf protobuf "llvm@${LLVM_VERSION}" "lld@${LLVM_VERSION}" echo "LLVM_CONFIG=$(brew --prefix "llvm@${LLVM_VERSION}")/bin/llvm-config" | tee -a "$GITHUB_ENV" diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 49104b3e5fa3..92ed5d2a5b0b 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule. $(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS) $(BIN)/%/out.png: $(BIN)/%/filter - $< ../images/rgba.png $(BIN)/$*/out.png + $< ../images/rgb.png $(BIN)/$*/out.png clean: rm -rf $(BIN) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index ef3b44eef461..7f411d7e8fef 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule if (!skip_schedule) { if (!target.has_gpu_feature()) { // CPU schedule. - // 8.2ms on an Intel i9-9960X using 16 threads + // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads // Split the transpose into tiles of rows. Parallelize over channels - // and strips (Halide supports nested parallelism). - Var xo, yo, t; + // and strips. 
+ Var xo, yo, t, yi; transpose.compute_root() .tile(x, y, xo, yo, x, y, vec, vec * 4) + .split(y, y, yi, vec) + .vectorize(yi) .vectorize(x) - .parallel(yo) - .parallel(c); + .fuse(yo, c, t) + .parallel(t); + + blur.in(transpose) + .compute_at(transpose, y) + .vectorize(x) + .unroll(y); // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). - blur.compute_at(transpose, yo); + blur.compute_at(transpose, t); // Vectorize computations within the strips. blur.update(0) diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..e7e56bb4df09 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -237,10 +237,39 @@ class CSEEveryExprInStmt : public IRMutator { } const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle}); internal_assert(bundle && bundle->args.size() == 2); - Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1], + + Expr value = bundle->args[0], index = bundle->args[1]; + + // Figure out which ones are actually needed by the index + + auto add_all_vars_to_set = [&](const Expr &e, std::set &s) { + visit_with(e, [&](auto *, const Variable *var) { + s.insert(var->name); + }); + }; + + std::set index_lets; + add_all_vars_to_set(index, index_lets); + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + add_all_vars_to_set(val, index_lets); + } + } + + vector> deferred; + for (const auto &[var, val] : reverse_view(lets)) { + if (index_lets.count(var)) { + deferred.emplace_back(var, val); + } else { + value = Let::make(var, val, value); + } + } + + Stmt s = Store::make(op->name, value, index, op->param, mutate(op->predicate), op->alignment); - for (const auto &[var, value] : reverse_view(lets)) { - s = LetStmt::make(var, value, s); + + for (const auto &[var, val] : deferred) { + s = LetStmt::make(var, val, s); } return s; } diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7178e82965d8..5cf9ccf77f26 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1478,10 
+1478,11 @@ void CodeGen_ARM::visit(const Store *op) { intrin_type = t; Type elt = t.element_of(); int vec_bits = t.bits() * t.lanes(); - if (elt == Float(32) || elt == Float(64) || - is_float16_and_has_feature(elt) || - elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || - elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { + if (t.bits() <= target.bits && + (elt == Float(32) || elt == Float(64) || + is_float16_and_has_feature(elt) || + elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || + elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) { const int target_vector_bits = native_vector_bits(); if (vec_bits % 128 == 0) { type_ok_for_vst = true; @@ -1895,6 +1896,7 @@ void CodeGen_ARM::visit(const Shuffle *op) { if (target.os != Target::IOS && target.os != Target::OSX && load && op->vectors.size() == 1 && + op->is_slice() && 2 <= stride && stride <= 4 && op->slice_begin() < stride && load->type.lanes() == stride * op->type.lanes()) { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 065dcebd1a64..0681dd42605b 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_Posix { llvm::Value *interleave_vectors(const std::vector &v) override; llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b, const std::vector &indices) override; + llvm::Value *optimization_fence(llvm::Value *v) override; using CodeGen_Posix::shuffle_vectors; ///@} @@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, return vdelta(concat_vectors({a, b}), indices); } +Value *CodeGen_Hexagon::optimization_fence(Value *v) { + // As of llvm 21, the base class version seems to trip up LLVM's hexagon + // backend, possibly because it relies on a floating point type. 
+ return v; +} + Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, int max_index) { llvm::Type *lut_ty = lut->getType(); @@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, return slice_vector(concat_vectors(result), 0, idx_elements); } -bool is_power_of_two(int x) { - return (x & (x - 1)) == 0; -} - // vdelta and vrdelta are instructions that take an input vector and // pass it through a network made up of levels. Each element x at each // level i can either take the element from the previous level at the diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 300dfa096a1e..2d74f12b4c67 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1359,10 +1359,6 @@ void CodeGen_LLVM::codegen(const Stmt &s) { s.accept(this); } -bool CodeGen_LLVM::is_power_of_two(int x) const { - return (x & (x - 1)) == 0; -} - Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) { return Float(32, t.lanes()); @@ -2207,10 +2203,13 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { internal_assert(vecs[0]->getType() == vecs[i]->getType()); } int vec_elements = get_vector_num_elements(vecs[0]->getType()); + const int num_vecs = (int)vecs.size(); - if (vecs.size() == 1) { + int factor = gcd(vec_elements, num_vecs); + + if (num_vecs == 1) { return vecs[0]; - } else if (vecs.size() == 2) { + } else if (num_vecs == 2) { Value *a = vecs[0]; Value *b = vecs[1]; vector indices(vec_elements * 2); @@ -2218,57 +2217,251 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector &vecs) { indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements; } return shuffle_vectors(a, b, indices); - } else { - // Grab the even and odd elements of vecs. 
- vector even_vecs; - vector odd_vecs; - for (size_t i = 0; i < vecs.size(); i++) { - if (i % 2 == 0) { - even_vecs.push_back(vecs[i]); - } else { - odd_vecs.push_back(vecs[i]); + } else if (factor == 1) { + // The number of vectors and the vector length is + // coprime. (E.g. interleaving an odd number of vectors of some + // power-of-two length). Use the algorithm from "A Decomposition for + // In-place Matrix Transposition" by Catanzaro et al. + std::vector v = vecs; + + // Using unary shuffles, get each element into the right ultimate + // lane. This works out without collisions because the number of vectors + // and the length of each vector is coprime. + std::vector shuffle(vec_elements); + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + shuffle[k % vec_elements] = j; } + v[i] = shuffle_vectors(v[i], v[i], shuffle); + } + + // We intentionally don't put an optimization fence after the unary + // shuffles, because some architectures have a two-way shuffle, so it + // helps to fuse the unary shuffle into the first layer of two-way + // blends below. + + // Now we need to transfer the elements across the vectors. If we + // reorder the vectors, this becomes a rotation across the vectors of a + // different amount per lane. + std::vector new_v(v.size()); + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + new_v[i] = v[j]; } + v.swap(new_v); - // If the number of vecs is odd, save the last one for later. - Value *last = nullptr; - if (even_vecs.size() > odd_vecs.size()) { - last = even_vecs.back(); - even_vecs.pop_back(); + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; } - internal_assert(even_vecs.size() == odd_vecs.size()); + internal_assert(rotation[0] == 0); - // Interleave the even and odd parts. 
- Value *even = interleave_vectors(even_vecs); - Value *odd = interleave_vectors(odd_vecs); + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + int d = 1; + while (d < num_vecs) { - if (last) { - int result_elements = vec_elements * vecs.size(); + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements); + } + + for (int i = 0; i < num_vecs; i++) { + int j = (i + num_vecs - d) % num_vecs; + new_v[i] = shuffle_vectors(v[i], v[j], shuffle); + } + + v.swap(new_v); - // Interleave even and odd, leaving a space for the last element. - vector indices(result_elements, -1); - for (int i = 0, idx = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size(); - idx++; + d *= 2; + } + + return concat_vectors(v); + } else { + // The number of vectors shares a factor with the length of the + // vectors. Pick some factor of the number of vectors, interleave in + // separate groups, and then interleave the results. Do the largest + // power of two factor first. + int f = largest_power_of_two_factor(num_vecs); + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + break; } } - Value *even_odd = shuffle_vectors(even, odd, indices); + } - // Interleave the last vector into the result. - last = slice_vector(last, 0, result_elements); - for (int i = 0; i < result_elements; i++) { - if (i % vecs.size() < vecs.size() - 1) { - indices[i] = i; - } else { - indices[i] = i / vecs.size() + result_elements; + // if f == 1 then the vector length is a multiple of the + // interleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad each vector out to a + // power of two size, interleave, and discard the tail of the + // result. This buys us some extra room to run Catanzaro's algorithm in. 
+ if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + std::vector padded(num_vecs); + for (int i = 0; i < num_vecs; i++) { + // slice_vector can also be used to pad with don't cares + padded[i] = slice_vector(vecs[i], 0, padded_size); + } + Value *v = interleave_vectors(padded); + return slice_vector(v, 0, num_vecs * vec_elements); + } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; + + vector> groups(f); + for (int i = 0; i < num_vecs; i++) { + groups[i % f].push_back(vecs[i]); + } + + // Interleave each group + vector interleaved(f); + for (int i = 0; i < f; i++) { + interleaved[i] = optimization_fence(interleave_vectors(groups[i])); + } + + // Interleave the result + return interleave_vectors(interleaved); + } +} + +std::vector CodeGen_LLVM::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()); + internal_assert(vec_elements % num_vecs == 0); + vec_elements /= num_vecs; + + int factor = gcd(vec_elements, num_vecs); + + if (num_vecs == 1) { + return {vec}; + } else if (num_vecs == 2) { + std::vector result(2); + std::vector indices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + indices[i] = i * 2; + } + result[0] = shuffle_vectors(vec, vec, indices); + for (int i = 0; i < vec_elements; i++) { + indices[i]++; + } + result[1] = shuffle_vectors(vec, vec, indices); + return result; + } else if (factor == 1) { + // Use the inverse of Catanzaro's algorithm from above. We slice into + // distinct vectors, then rotate each element into the correct final + // vector, then do a unary permutation of each vector. + + // Instead of concatenating, we slice. 
+ std::vector v(num_vecs); + for (int i = 0; i < num_vecs; i++) { + v[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + + // Compute the same rotation as above + std::vector rotation(vec_elements, 0); + for (int i = 0; i < vec_elements; i++) { + int k = (i * num_vecs) % vec_elements; + rotation[k] = (i * num_vecs) / vec_elements; + } + internal_assert(rotation[0] == 0); + + // We'll handle each bit of the rotation one at a time with a two-way + // shuffle. + std::vector shuffle(vec_elements); + std::vector new_v(v.size()); + int d = 1; + while (d < num_vecs) { + + for (int i = 0; i < vec_elements; i++) { + shuffle[i] = ((rotation[i] & d) == 0) ? i : (i + vec_elements); + } + + for (int i = 0; i < num_vecs; i++) { + // The rotation is in the opposite direction to the interleaving + // version, so num_vecs - d becomes just d. + int j = (i + d) % num_vecs; + // An optimization fence here keeps it as a blend and stops it + // from getting fused with the unary shuffle below. + new_v[i] = optimization_fence(shuffle_vectors(v[i], v[j], shuffle)); + } + + v.swap(new_v); + d *= 2; + } + + // Now reorder the vectors in the inverse order to the above. + for (int i = 0; i < num_vecs; i++) { + int j = (i * vec_elements) % num_vecs; + // j and i are swapped below, because we're doing the inverse of the + // algorithm above. This map is 1:1 because vec_elements and + // num_vecs are coprime, so every slot of new_v is stored to. + new_v[j] = v[i]; + } + v.swap(new_v); + + // The elements are now in the correct vector. Finish up with a unary + // shuffle of each. + for (int i = 0; i < num_vecs; i++) { + for (int j = 0; j < vec_elements; j++) { + int k = j * num_vecs + i; + // This is the inverse shuffle of the interleaving version, so + // the index and the arg of the assignment below are swapped + // compared to the above. 
+ shuffle[j] = k % vec_elements; + } + + v[i] = shuffle_vectors(v[i], v[i], shuffle); + } + + return v; + + } else { + // Do a lower-factor deinterleave, then deinterleave each result + // again. We know there's a non-trivial factor because if it were prime + // the gcd above would have been 1. Do the largest power-of-two factor + // first. + int f = largest_power_of_two_factor(num_vecs); + if (f == 1 || f == num_vecs) { + for (int i = 2; i < num_vecs; i++) { + if (num_vecs % i == 0) { + f = i; + break; } } + } - return shuffle_vectors(even_odd, last, indices); - } else { - return interleave_vectors({even, odd}); + // if f == 1 then the final vector length is a multiple of the + // deinterleaving factor and the number of vectors is prime but not two + // (e.g. vec_elements = 24 and num_vecs = 3). Pad the vector out to a + // power of two size, deinterleave, and discard the tail of each vector + // result. This buys us some extra room to run Catanzaro's algorithm in. + if (f == 1) { + int padded_size = next_power_of_two(vec_elements); + Value *padded = slice_vector(vec, 0, padded_size * num_vecs); + std::vector result = deinterleave_vector(padded, num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(result[i], 0, vec_elements); + } + return result; } + + internal_assert(f > 1 && f < num_vecs && num_vecs % f == 0) + << f << " " << num_vecs << " " << factor; + + auto partial = deinterleave_vector(vec, f); + std::vector result(num_vecs); + for (size_t i = 0; i < partial.size(); i++) { + Value *v = partial[i]; + auto vecs = deinterleave_vector(v, num_vecs / f); + for (size_t j = 0; j < vecs.size(); j++) { + result[j * f + i] = vecs[j]; + } + } + + return result; } } @@ -4162,6 +4355,24 @@ void CodeGen_LLVM::visit(const Shuffle *op) { if (op->is_interleave()) { value = interleave_vectors(vecs); + } else if (op->is_transpose()) { + int cols = op->transpose_factor(); + int rows = op->vectors[0].type().lanes() / cols; + if (is_power_of_two(cols) && + 
!is_power_of_two(rows)) { + // We're doing something like vectorizing over c and x when storing + // packed rgb. Best handled as an interleave. + std::vector slices(rows); + for (int i = 0; i < rows; i++) { + slices[i] = slice_vector(vecs[0], i * cols, cols); + } + value = interleave_vectors(slices); + } else { + // Deinterleave out the cols of the input matrix and concat + // them. Occurs when, for example, loading packed RGB and + // vectorizing across x. + value = concat_vectors(deinterleave_vector(vecs[0], cols)); + } } else if (op->is_concat()) { value = concat_vectors(vecs); } else { @@ -4981,8 +5192,12 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { Value *CodeGen_LLVM::optimization_fence(Value *v) { llvm::Type *t = v->getType(); - internal_assert(!t->isScalableTy()) - << "optimization_fence does not support scalable vectors yet"; + if (t->isScalableTy()) { + // Convert to fixed, fence, convert back. + Value *fixed = scalable_to_fixed_vector_type(v); + fixed = optimization_fence(fixed); + return fixed_to_scalable_vector_type(fixed); + } const int bits = t->getPrimitiveSizeInBits(); if (bits % 32) { const int lanes = get_vector_num_elements(t); @@ -4994,7 +5209,7 @@ Value *CodeGen_LLVM::optimization_fence(Value *v) { v = slice_vector(v, 0, lanes); return v; } - llvm::Type *float_type = llvm_type_of(Float(32, bits / 32)); + llvm::Type *float_type = get_vector_type(f32_t, bits / 32, VectorTypeConstraint::Fixed); v = builder->CreateBitCast(v, float_type); v = builder->CreateArithmeticFence(v, float_type); return builder->CreateBitCast(v, t); diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index bdd267020f1a..57d78172c4fa 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -462,6 +462,9 @@ class CodeGen_LLVM : public IRVisitor { * an arbitrary number of vectors.*/ virtual llvm::Value *interleave_vectors(const std::vector &); + /** The inverse of interleave_vectors. 
*/ + virtual std::vector deinterleave_vector(llvm::Value *vec, int num_vecs); + /** Description of an intrinsic function overload. Overloads are resolved * using both argument and return types. The scalar types of the arguments * and return type must match exactly for an overload resolution to succeed. */ @@ -530,8 +533,6 @@ class CodeGen_LLVM : public IRVisitor { /** Shorthand for shuffling a single vector. */ llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector &indices); - bool is_power_of_two(int x) const; - bool is_scalable_vector(llvm::Value *v) const; /** Go looking for a vector version of a runtime function. Will diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index f5cd7713884c..3e52ac7fa1a5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -11,6 +11,8 @@ #include "Substitute.h" #include "Util.h" +#include + namespace Halide { namespace Internal { @@ -111,6 +113,9 @@ class CodeGen_X86 : public CodeGen_Posix { void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; // @} + std::vector deinterleave_vector(llvm::Value *, int) override; + llvm::Value *interleave_vectors(const std::vector &) override; + private: Scope mem_type; }; @@ -929,6 +934,753 @@ void CodeGen_X86::codegen_vector_reduce(const VectorReduce *op, const Expr &init CodeGen_Posix::codegen_vector_reduce(op, init); } +std::vector CodeGen_X86::deinterleave_vector(Value *vec, int num_vecs) { + int vec_elements = get_vector_num_elements(vec->getType()) / num_vecs; + const size_t element_bits = vec->getType()->getScalarSizeInBits(); + if (target.has_feature(Target::AVX) && + is_power_of_two(num_vecs) && + is_power_of_two(vec_elements) && + (int)(vec_elements * num_vecs * element_bits) > native_vector_bits()) { + + // Our interleaving logic below supports this case + std::vector slices(vec_elements); + for (int i = 0; i < vec_elements; i++) { + slices[i] = slice_vector(vec, i * num_vecs, num_vecs); + } + vec = interleave_vectors(slices); + std::vector 
result(num_vecs); + for (int i = 0; i < num_vecs; i++) { + result[i] = slice_vector(vec, i * vec_elements, vec_elements); + } + return result; + } else { + return CodeGen_Posix::deinterleave_vector(vec, num_vecs); + } +} + +Value *CodeGen_X86::interleave_vectors(const std::vector &vecs) { + // Only use x86-specific interleaving for AVX and above + if (vecs.empty() || !target.has_feature(Target::AVX)) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + if (vecs.size() == 1) { + return vecs[0]; + } + + // Get the element type and vector properties + llvm::Type *vec_type = vecs[0]->getType(); + llvm::Type *element_type = get_vector_element_type(vec_type); + int vec_elements = get_vector_num_elements(vec_type); + const size_t element_bits = element_type->getScalarSizeInBits(); + const size_t elems_per_native_vec = native_vector_bits() / element_bits; + const size_t elems_per_slice = 128 / element_bits; + + // Only apply special x86 logic for power-of-two interleaves for avx and + // above where we're going to end up with multiple native vectors. + + if (!is_power_of_two(vec_elements) && + vec_elements % elems_per_native_vec == 0) { + // It's not a power of two, but it's a multiple of the native vector + // length, so slice it and recurse. + std::vector results; + for (int i = 0; i < vec_elements; i += elems_per_native_vec) { + std::vector slices; + slices.reserve(vecs.size()); + for (auto *v : vecs) { + slices.push_back(slice_vector(v, i, (int)elems_per_native_vec)); + } + results.push_back(interleave_vectors(slices)); + } + return concat_vectors(results); + } + + if (!is_power_of_two(vec_elements) || + !is_power_of_two(vecs.size()) || + (vecs.size() * vec_elements * element_bits) <= (size_t)native_vector_bits()) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + /* + x86 has a weird set of vector shuffle instructions due to historical + baggage, and the strategy in the base class for interleaving vectors + works poorly. 
Here we have a somewhat complex algorithm for generating + better sequences of shuffle instructions for avx and avx-512. + + Consider the location of one of the elements of one of the vectors. It has + a vector index, which says which vector it's in, and a vector lane index, + which gives the lane. x86 shuffles work in terms of 128-bit subvectors, + which we will call slices. So we'll decompose that lane index into a slice + index, to identify the 128-bit slice within a vector, and the lane index + within that slice. For avx the slice index is either zero or one, and for + avx-512 it can be zero through three. Because we have limited everything + to be a power of two, we can write out these indices in binary. We'll use + v for the vector index, s for the slice index, and l for the lane + index. For an avx-512 interleave of 16 vectors of 32 elements each + (i.e. uint16s), a location could thus be written as: + + [l0 l1 l2] [s0 s1] [v0 v1 v2 v3] + + where l0 is the least-significant bit of the lane index, and so on. + + An interleave takes the bits that give the vector index and moves them to + be the least significant bits, shifting everything else over. So the + indices of our vectors after the interleave should be: + + [v0 v1 v2] [v3 l0] [l1 l2 s0 s1] + + Assigning numbers to each according to their final location, we start with: + + [4 5 6] [7 8] [0 1 2 3] + + and we want to issue some sequence of instructions to get us to: + + [0 1 2] [3 4] [5 6 7 8] + + Now let's consider the instructions we have available. These generally + permute these bits. E.g. an instruction that interleaves two entire + vectors, applied to pairs of vectors, would take the same vector bit and + make it the lowest lane bit instead, shuffling the other bits upwards, + with the highest-order within-vector bit taking the place of the vector + bit (because we produce separate vectors for the low and high half of the + result). 
+ So if we used this instruction to push the highest vector bit + inwards, we could turn this: + + [4 5 6] [7 8] [0 1 2 3] + + into this: + + [3 4 5] [6 7] [0 1 2 8] + + If we did this three more times, pulling a different vector bit in each + time, we'd get: + + [0 1 2] [3 4] [5 6 7 8] + + and we'd be done! This is what the base class does. Unfortunately, x86 has + no such instruction, so we'll have to figure out something else. + Interleaving vectors often happens in contexts with high register + pressure, so we will restrict our attention to instructions that take + immediates. The most important one is vunpckl/h. This interleaves lanes + between two vectors while staying within each 128-bit slice. So the slice + bits will be unchanged, and the lane bits will be rotated right along with + one of the vector bits. So if we interleave vectors starting from the + second-highest vector bit, we can turn this: + + [4 5 6] [_ _] [_ _ 2 _] + + into this: + + [2 4 5] [_ _] [_ _ 6 _] + + where the underscores indicate bits that are unchanged. + + Unlike a full vector interleave, the slice bits stayed fixed, and the + highest within-slice lane bit (6) took the place of the vector bit + instead. This is at least a good start. If we do this two more times, + pulling in vector bits 0 and 1, we can make this: + + [0 1 2] [7 8] [4 5 6 3] + + The lane bits are now in the desired state. The next instruction to + consider is shufi. It's more general than this, but for our purposes there + are two interesting things we can do with it. 
We can concatenate the low halves + of two vectors or the high halves of two vectors, which swaps the + high-order slice bit with one of the vector bits: + + [_ _ _] [_ 8] [_ _ _ 3] -> [_ _ _] [_ 3] [_ _ _ 8] + + We can also interleave the even slices of a vector with the even slices of + another (and do the same for odd), which rotates left the two slice bits + together with one of the vector bits: + + [_ _ _] [7 3] [4 _ _ _] -> [_ _ _] [3 4] [7 _ _ _] + + The vector bit became the high slice bit, the low slice bit took the place + of the vector bit, and the high slice bit became the low slice + bit. Filling in the underscores, we're now in this state: + + [0 1 2] [3 4] [7 5 6 8] + + Only the vector bits are wrong, but permuting entire vectors is free, + because that's just changing which register names we're referring to + (shuffling our array of llvm::Value *). So all totalled, per vector, we + needed three unpckl/h instructions, and one shufi instruction of each + kind. If the vectors were a narrower type, it would have just added one + more unpckl. + + If you're interleaving lots of complete vectors, that's the whole story, + but there are other situations to consider. It's not uncommon to want to + interleave half-vectors to make some number of full vectors. We can model + this by having some slice or even lane bits start as missing. So + interleaving 16 half-vectors of uint16s to 8 full vectors would be + starting from this: + + [4 5 6] [7] [0 1 2 3] + + and trying to get here: + + [0 1 2] [3 4] [5 6 7] + + Each of our instructions has to operate on every vector, so to reduce the + number of instructions we'd first like to do something to create that + missing high slice bit, halving the number of vectors. E.g. we could + identify pairs of vectors to concatenate. 
Let's try concatenating pairs + using the high vector bit (3): + + [4 5 6] [7 3] [0 1 2] + + Now we do three unpcks to rotate 0 1 2 into the correct place: + + [0 1 2] [7 3] [4 5 6] + + Now a single shufi can rotate 7 3 and 4: + + [0 1 2] [3 4] [7 5 6] + + and we just need to reorder whole vectors and we're done. So in this case + we needed only a single shufi instruction, because our desired low slice + bit (3) was already sitting there as the high slice bit after + pairwise concatenation. + + Now consider the case where we had only four half-vectors to interleave to + produce two whole vectors: + + [2 3 4] [5] [0 1] + + Let's concatenate adjacent pairs as before. + + [2 3 4] [5 0] [1] + + Now we do one unpck + + [1 2 3] [5 0] [4] + + And we encounter a problem when it comes to the second one. The next bit + we want pull in is hiding in the slice bits, which unpck instructions + can't access. So at this point we use a shufi to push it back into the + vector bits, swapping 0 and 4. + + [1 2 3] [5 4] [0] + + Now we can do the last unpck. + + [0 1 2] [5 4] [3] + + From here we can use two shufi instructions to fix up the vector and slice + bits. + + So there are many possible paths depending on the number of elements per + vector, the number of elements per 128-bit slice of each vector, and the + number of vectors to interleave. The way to stay sane is to just + explicitly track the vectors above as l_bits, s_bits, and v_bits, and + transform it alongside all our instructions as we try to get the right + bits in the right final places. + */ + + // Make a working copy + std::vector v = vecs; + + // The number of 128-bit slices per vector is 2 for avx and 4 for avx512 + const int final_num_s_bits = ctz64(native_vector_bits() / 128); + internal_assert(final_num_s_bits == 1 || final_num_s_bits == 2) + << native_vector_bits() << " " << final_num_s_bits; + + const int num_v_bits = ctz64(v.size()); + const int num_s_bits = ((size_t)vec_elements <= elems_per_slice) ? 
0 : ctz64(vec_elements / elems_per_slice); + const int num_l_bits = ctz64(std::min((size_t)vec_elements, elems_per_slice)); + + // Construct the initial tracking vectors for each bit location + std::vector v_bits(num_v_bits), l_bits(num_l_bits), s_bits(num_s_bits); + int c = 0; + for (int i = 0; i < num_v_bits; i++) { + // We want the v bits to end up innermost, so number them 0, 1, 2 ... + v_bits[i] = c++; + } + for (int i = 0; i < num_l_bits; i++) { + // Then come the l bits + l_bits[i] = c++; + } + for (int i = 0; i < num_s_bits; i++) { + // and finally, the slice bits + s_bits[i] = c++; + } + + // Now we define helpers for each instruction we are going to use + + // Useful for debugging or enhancing this algorithm + /* + auto dump_bits = [&]() { + for (int b : l_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : s_bits) { + debug(0) << b << " "; + } + debug(0) << "| "; + for (int b : v_bits) { + debug(0) << b << " "; + } + debug(0) << "\n"; + }; + */ + + // unpckl/h instruction + auto unpck = [&](Value *a, Value *b) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + + for (int i = 0; i < n; i += (int)elems_per_slice) { + int half = (int)elems_per_slice / 2; + // For the low result, interleave the first half of each slice + for (int j = 0; j < half; j++) { + lo_indices.push_back(i + j); + lo_indices.push_back(n + i + j); + } + // For the high result, interleave the second half of each slice + for (int j = half; j < (int)elems_per_slice; j++) { + hi_indices.push_back(i + j); + hi_indices.push_back(n + i + j); + } + } + + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + // Everything falls apart if we let LLVM fuse shuffles, so we add + // optimization fences around the results to ensure we get the + // instructions we're asking for. 
+ return {optimization_fence(lo), optimization_fence(hi)}; + }; + + // shufi instruction, with or without cross-over + auto shufi = [&](Value *a, Value *b, bool crossover) -> std::pair { + int n = get_vector_num_elements(a->getType()); + std::vector lo_indices, hi_indices; + if (final_num_s_bits == 2) { + // AVX-512 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + (crossover ? 1 : 2) * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i + (crossover ? 2 : 1) * (int)elems_per_slice); + hi_indices.push_back(i + 3 * (int)elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice * 2; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } else { + // AVX-2 + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(i); + hi_indices.push_back(i + elems_per_slice); + } + for (int i = 0; i < (int)elems_per_slice; i++) { + lo_indices.push_back(lo_indices[i] + n); + hi_indices.push_back(hi_indices[i] + n); + } + } + Value *lo = shuffle_vectors(a, b, lo_indices); + Value *hi = shuffle_vectors(a, b, hi_indices); + return {lo, hi}; + }; + + // A 2x2 transpose of slices within a single vector + auto self_shufi = [&](Value *a) -> Value * { + internal_assert(4 * (int)elems_per_slice == vec_elements) + << "Should only be using shufi helper when targeting avx-512 shuffles on native vectors\n" + << elems_per_slice << " " << vec_elements << " " << native_vector_bits() << "\n"; + std::vector indices; + for (int j : {0, 2, 1, 3}) { + for (int i = 0; i < (int)elems_per_slice; i++) { + indices.push_back(i + j * (int)elems_per_slice); + } + } + return shuffle_vectors(a, a, indices); + }; + + // A helper to iterate over all pairs of entries in v, separated by some + // power-of-two spacing. 
+ auto for_all_pairs = [&](size_t log_step, auto fn) { + size_t step = 1 << log_step; + for (size_t i = 0; i < v.size(); i++) { + // Pair each vector with the one separated by the step. + size_t j = i ^ step; + + // Don't process vectors twice. + if (j < i) { + continue; + } + + fn(&v[i], &v[j]); + } + }; + + // First, if the vectors are wider than native, that will manifest as too + // many slice bits. Cut them into separate native vectors. This will not + // create any instructions. + while ((size_t)vec_elements > elems_per_native_vec) { + int cut = vec_elements / 2; + std::vector new_v; + new_v.reserve(v.size() * 2); + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, 0, cut)); + } + for (auto *vec : v) { + new_v.push_back(slice_vector(vec, cut, cut)); + } + v = new_v; + vec_elements = cut; + + v_bits.push_back(s_bits.back()); + s_bits.pop_back(); + } + + // If adjacent vectors are shuffles of the same underlying vector(s), + // concatenate pairs, because this is probably free. 
+ while ((size_t)vec_elements < elems_per_native_vec && !v_bits.empty()) { + std::vector new_v; + new_v.reserve(v.size() / 2); + bool fail = false; + std::vector indices; + indices.reserve(vec_elements * 2); + for (size_t i = 0; i < v.size(); i += 2) { + ShuffleVectorInst *a = llvm::dyn_cast(v[i]); + ShuffleVectorInst *b = llvm::dyn_cast(v[i + 1]); + if (a && + b && + a->getOperand(0) == b->getOperand(0) && + a->getOperand(1) == b->getOperand(1)) { + + // Concatenate the two shuffles + indices.clear(); + for (int j : a->getShuffleMask()) { + indices.push_back(j); + } + for (int j : b->getShuffleMask()) { + indices.push_back(j); + } + new_v.push_back(shuffle_vectors(a->getOperand(0), a->getOperand(1), indices)); + } else { + fail = true; + } + } + if (fail) { + break; + } + + v.swap(new_v); + // The lowest vector bit becomes the highest lane or slice bit + if ((size_t)vec_elements < elems_per_slice) { + l_bits.push_back(v_bits[0]); + } else { + s_bits.push_back(v_bits[0]); + } + v_bits.erase(v_bits.begin()); + vec_elements *= 2; + } + + if (final_num_s_bits > 1 && + (size_t)vec_elements == elems_per_native_vec && + (size_t)v_bits[0] >= l_bits.size() - 1) { + // A big binary shuffle of adjacent pairs will fix the l bits + // entirely. AVX-512 has these. Yes, this will use registers for the + // shuffle indices, but the alternative requires very many unpck + // operations to completely cycle out the v_bits that are hiding in the + // bottom of the l_bits. 
+ + std::vector lo_indices(vec_elements); + std::vector hi_indices(vec_elements); + std::vector sorted_bits = l_bits; + sorted_bits.insert(sorted_bits.end(), s_bits.begin(), s_bits.end()); + sorted_bits.push_back(v_bits[0]); + std::sort(sorted_bits.begin(), sorted_bits.end()); + std::vector idx_of_bit(l_bits.size() + s_bits.size() + v_bits.size(), 0); + for (size_t b = 0; b < sorted_bits.size(); b++) { + idx_of_bit[sorted_bits[b]] = b; + } + + for (size_t dst_idx = 0; dst_idx < (size_t)vec_elements * 2; dst_idx++) { + size_t src_idx = 0; + for (size_t b = 0; b < l_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[l_bits[b]]) & 1) << b; + } + for (size_t b = 0; b < s_bits.size(); b++) { + src_idx |= ((dst_idx >> idx_of_bit[s_bits[b]]) & 1) << (b + l_bits.size()); + } + src_idx |= ((dst_idx >> idx_of_bit[v_bits[0]]) & 1) << (l_bits.size() + s_bits.size()); + if (dst_idx < (size_t)vec_elements) { + lo_indices[dst_idx] = (int)src_idx; + } else { + hi_indices[dst_idx - vec_elements] = (int)src_idx; + } + } + + for_all_pairs(0, [&](auto *a, auto *b) { + Value *lo = shuffle_vectors(*a, *b, lo_indices); + Value *hi = shuffle_vectors(*a, *b, hi_indices); + *a = lo; + *b = hi; + }); + + auto first_s_bit = sorted_bits.begin() + l_bits.size(); + std::copy(sorted_bits.begin(), first_s_bit, l_bits.begin()); + std::copy(first_s_bit, first_s_bit + s_bits.size(), s_bits.begin()); + v_bits[0] = sorted_bits.back(); + } + + // Interleave pairs if we have vectors smaller than a single slice. Choosing + // which pairs to interleave is important because we want to pull down v + // bits that are destined to end up as l bits, and we want to pull them down + // in order. 
+ if ((size_t)vec_elements < elems_per_slice) { + int highest_desired_l_bit = ctz64(elems_per_slice) - 1; + int bit = highest_desired_l_bit; + if (!v_bits.empty() && std::find(v_bits.begin(), v_bits.end(), bit) == v_bits.end()) { + bit = v_bits.back(); + } + + while (bit >= 0 && (size_t)vec_elements < elems_per_slice && !v_bits.empty()) { + auto it = std::find(v_bits.begin(), v_bits.end(), bit); + if (it == v_bits.end()) { + break; + } + int j = it - v_bits.begin(); + v_bits.erase(it); + l_bits.insert(l_bits.begin(), bit); + + // The distance in the vecs array is the index of the corresponding + // v bit we're pulling down. + std::vector new_v; + new_v.reserve(v.size() / 2); + for_all_pairs(j, [&](auto *a, auto *b) { + // Just interleave the two vectors. Because we have fewer + // elements than one slice, unpckl/h is a straight interleave. + std::vector indices; + for (int k = 0; k < vec_elements; k++) { + indices.push_back(k); + indices.push_back(vec_elements + k); + } + new_v.push_back(shuffle_vectors(*a, *b, indices)); + }); + v.swap(new_v); + vec_elements *= 2; + bit--; + } + } + + // Concatenate/repack to get at least the desired number of slice bits. + while ((int)s_bits.size() < final_num_s_bits && !v_bits.empty()) { + const int desired_low_slice_bit = ctz64(elems_per_slice); + const int desired_high_slice_bit = desired_low_slice_bit + 1; + int bit; + if (!s_bits.empty() && + s_bits[0] == desired_low_slice_bit) { + // Only the avx-512 path should land here due to the while condition. + internal_assert(final_num_s_bits == 2); + bit = desired_high_slice_bit; + } else { + bit = desired_low_slice_bit; + } + + auto v_it = std::find(v_bits.begin(), v_bits.end(), bit); + + if (v_it == v_bits.end()) { + // Just concatenate according to the lowest vector bit. 
+ v_it = v_bits.begin(); + bit = *v_it; + } + + int j = v_it - v_bits.begin(); + v_bits.erase(v_it); + s_bits.push_back(bit); + + std::vector new_v; + new_v.reserve(v.size() / 2); + for_all_pairs(j, [&](auto *a, auto *b) { + new_v.push_back(concat_vectors({*a, *b})); + }); + v.swap(new_v); + vec_elements *= 2; + } + + // There should be more than one vector left + internal_assert(v.size() > 1); + + // Now we have at least two whole vectors. Next we try to finalize lane bits using + // unpck instructions. + while (l_bits[0] != 0) { + + int first_s_bit = (int)ctz64(elems_per_slice); + int bit = std::min(l_bits[0], first_s_bit) - 1; + + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + + // internal_assert(vb_it != v_bits.end()); + if (vb_it == v_bits.end()) { + // The next bit is not in vector bits. It must be hiding in the + // slice bits due to earlier concatenation. Move it into the v_bits + // with a shufi. We'll need to pick a v bit to take its place, + // ideally one destined to end up in the s bits. + vb_it = std::find_if(v_bits.begin(), v_bits.end(), [&](int b) { return b >= first_s_bit; }); + if (vb_it == v_bits.end()) { + vb_it = v_bits.begin(); + } + + if (s_bits.back() == bit) { + // It's the last (or sole) slice bit. Swap it with the first v bit + std::swap(s_bits.back(), *vb_it); + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); + } else { + internal_assert(s_bits.size() == 2 && s_bits[0] == bit); + // It's the low slice bit. We need shufi with crossover. 
+ int v_bit = *vb_it; + *vb_it = s_bits[0]; + s_bits[0] = s_bits[1]; + s_bits[1] = v_bit; + for_all_pairs(vb_it - v_bits.begin(), [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, true); + *a = lo; + *b = hi; + }); + } + } + + int j = vb_it - v_bits.begin(); + *vb_it = l_bits.back(); + l_bits.pop_back(); + l_bits.insert(l_bits.begin(), bit); + + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = unpck(*a, *b); + *a = lo; + *b = hi; + }); + } + + // Lane bits should now be 0, 1, 2, 3... + for (int i = 0; i < (int)l_bits.size(); i++) { + internal_assert(l_bits[i] == i); + } + + // Time to fix the slice bits + + // First the low slice bit. If it's one of the v bits, move it to be the + // high slice bit with a shufi. + int low_slice_bit = l_bits.size(); + auto ls_in_v = std::find(v_bits.begin(), v_bits.end(), low_slice_bit); + if (ls_in_v != v_bits.end()) { + int j = ls_in_v - v_bits.begin(); + std::swap(*ls_in_v, s_bits.back()); + + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); + } + + // And then the high slice bit, if there is one + if (final_num_s_bits == 2) { + // AVX-512 + int high_slice_bit = low_slice_bit + 1; + auto hs_in_v = std::find(v_bits.begin(), v_bits.end(), high_slice_bit); + if (hs_in_v != v_bits.end()) { + // The high slice bit is in the v_bits. Note that if it's not, it'll + // be one of the slice bits. It can't be an l bit, because we've + // already finalized them. + int j = hs_in_v - v_bits.begin(); + + if (!s_bits.empty() && s_bits.back() == low_slice_bit) { + // The low slice bit is currently occupying the high slice bit slot, + // so we need to shuffle it over at the same time by using the + // crossover variant of shufi. 
+ int temp = s_bits[0]; + s_bits[0] = s_bits.back(); + s_bits.back() = *hs_in_v; + *hs_in_v = temp; + + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, true); + *a = lo; + *b = hi; + }); + + } else { + // The low slice bit must be already in place, so no crossover required. + internal_assert(s_bits[0] == low_slice_bit); + std::swap(*hs_in_v, s_bits.back()); + + for_all_pairs(j, [&](auto *a, auto *b) { + auto [lo, hi] = shufi(*a, *b, false); + *a = lo; + *b = hi; + }); + } + } else if (s_bits.size() == 2 && + s_bits[0] == high_slice_bit && + s_bits[1] == low_slice_bit) { + // The slice bits are both there, but in the wrong order + std::swap(s_bits[0], s_bits[1]); + for (auto &vec : v) { + vec = self_shufi(vec); + } + } + + // Both slice bits should be correct now + internal_assert(s_bits.size() == 2 && + s_bits[0] == low_slice_bit && + s_bits[1] == high_slice_bit); + + } else { + // AVX-2 The sole slice bit should be correct now. + internal_assert(s_bits.size() == 1 && + s_bits[0] == low_slice_bit); + } + + // The lane and slice bits are correct, but the vectors are in some + // arbitrary order. We'll reorder them by deinterleaving the list according + // to each bit position, in increasing order. 
+ for (size_t i = 0; i < v_bits.size(); i++) { + int bit = i + s_bits.size() + l_bits.size(); + auto vb_it = std::find(v_bits.begin(), v_bits.end(), bit); + internal_assert(vb_it != v_bits.end()); + + int j = vb_it - v_bits.begin(); + v_bits.erase(vb_it); + v_bits.push_back(bit); + + std::vector a, b; + a.reserve(v.size() / 2); + b.reserve(v.size() / 2); + int mask = 1 << j; + for (size_t k = 0; k < v.size(); k++) { + if ((k & mask) == 0) { + a.push_back(v[k]); + } else { + b.push_back(v[k]); + } + } + v.clear(); + v.insert(v.end(), a.begin(), a.end()); + v.insert(v.end(), b.begin(), b.end()); + } + + // The v bits should be correct now + for (int i = 0; i < (int)v_bits.size(); i++) { + internal_assert(v_bits[i] == i + (int)(l_bits.size() + s_bits.size())); + } + + // Concatenate all results into a single vector. Phew. + return concat_vectors(v); +} + void CodeGen_X86::visit(const Allocate *op) { ScopedBinding bind(mem_type, op->name, op->memory_type); CodeGen_Posix::visit(op); diff --git a/src/IR.cpp b/src/IR.cpp index 2c91d16b50e1..c5158728f367 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -815,6 +815,21 @@ Expr Shuffle::make_interleave(const std::vector &vectors) { return make(vectors, indices); } +Expr Shuffle::make_transpose(Expr e, int cols) { + internal_assert(e.type().lanes() % cols == 0) + << "Transpose cols must divide the number of lanes.\n"; + int rows = e.type().lanes() / cols; + + std::vector indices(e.type().lanes()); + for (int j = 0; j < cols; j++) { + for (int i = 0; i < rows; i++) { + indices[j * rows + i] = i * cols + j; + } + } + + return make({std::move(e)}, indices); +} + Expr Shuffle::make_concat(const std::vector &vectors) { internal_assert(!vectors.empty()) << "Concat of zero vectors.\n"; @@ -1012,6 +1027,33 @@ bool Shuffle::is_concat() const { return indices.size() == input_lanes && is_ramp(indices); } +bool Shuffle::is_transpose() const { + if (vectors.size() > 1 || + (int)indices.size() != vectors[0].type().lanes() || + indices.size() < 2 
|| + indices[0] != 0 || + indices[1] <= 0) { + return false; + } + int cols = indices[1]; + int rows = vectors[0].type().lanes() / cols; + if ((int)indices.size() != rows * cols) { + return false; + } + for (int row = 0; row < rows; row++) { + for (int col = 0; col < cols; col++) { + if (indices[col * rows + row] != row * cols + col) { + return false; + } + } + } + return true; +} + +int Shuffle::transpose_factor() const { + return indices[1] - indices[0]; +} + bool Shuffle::is_slice() const { size_t input_lanes = 0; for (const Expr &i : vectors) { diff --git a/src/IR.h b/src/IR.h index 3666581803db..c1d5a57483e3 100644 --- a/src/IR.h +++ b/src/IR.h @@ -990,6 +990,13 @@ struct Shuffle : public ExprNode { * interleaving of vectors of the same length. */ static Expr make_interleave(const std::vector &vectors); + /** Convenience constructor for making a shuffle representing an in-place + * transpose of a row-major matrix with the given number of columns. The + * output, interpreted as a row-major matrix, therefore has that number of + * rows. For example, to turn the vector RGBRGBRGBRGB into RRRRGGGGBBBB cols + * would be 3, and to do the reverse cols would be 4. */ + static Expr make_transpose(Expr e, int cols); + /** Convenience constructor for making a shuffle representing a * concatenation of the vectors. */ static Expr make_concat(const std::vector &vectors); @@ -1010,6 +1017,13 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; + /** Check if this shuffle is an in-place transpose of a single vector. The + * factor is the number of columns of the source matrix, or equivalently, + * the number of rows of the destination matrix, interpreting a vector as a + * matrix stored row-major. */ + bool is_transpose() const; + int transpose_factor() const; + /** Check if this shuffle can be represented as a repeating pattern that * repeats the same shuffle of the single input vector some number of times.
* For example: 0, 3, 1, 1, 0, 3, 1, 1, ....., 0, 3, 1, 1 diff --git a/src/IRMatch.h b/src/IRMatch.h index 4ec8b2694e3f..dc5922a80028 100644 --- a/src/IRMatch.h +++ b/src/IRMatch.h @@ -2249,6 +2249,60 @@ HALIDE_ALWAYS_INLINE auto slice(Vec vec, Base base, Stride stride, Lanes lanes) return {pattern_arg(vec), pattern_arg(base), pattern_arg(stride), pattern_arg(lanes)}; } +template +struct TransposeOp { + struct pattern_tag {}; + Vec vec; + Factor factor; + + static constexpr uint32_t binds = Vec::binds | Factor::binds; + + constexpr static IRNodeType min_node_type = IRNodeType::Shuffle; + constexpr static IRNodeType max_node_type = IRNodeType::Shuffle; + constexpr static bool canonical = Vec::canonical && Factor::canonical; + + template + HALIDE_ALWAYS_INLINE bool match(const BaseExprNode &e, MatcherState &state) const noexcept { + if (e.node_type != IRNodeType::Shuffle) { + return false; + } + const Shuffle &v = (const Shuffle &)e; + return v.vectors.size() == 1 && + v.is_transpose() && + vec.template match(*v.vectors[0].get(), state) && + factor.template match<(bound | bindings::mask)>(v.transpose_factor(), state); + } + + HALIDE_ALWAYS_INLINE + Expr make(MatcherState &state, halide_type_t type_hint) const { + halide_scalar_value_t factor_val; + halide_type_t ty; + factor.make_folded_const(factor_val, ty, state); + int f = (int)factor_val.u.i64; + return Shuffle::make_transpose(vec.make(state, type_hint), f); + } + + constexpr static bool foldable = false; + + HALIDE_ALWAYS_INLINE + TransposeOp(Vec v, Factor f) + : vec(v), factor(f) { + static_assert(Factor::foldable, "Factor of transpose should consist only of operations that constant-fold"); + } +}; + +template +std::ostream &operator<<(std::ostream &s, const TransposeOp &op) { + s << "transpose(" << op.vec << ", " << op.factor << ")"; + return s; +} + +template +HALIDE_ALWAYS_INLINE auto transpose(Vec vec, Factor factor) noexcept + -> TransposeOp { + return {pattern_arg(vec), pattern_arg(factor)}; +} + template 
struct Fold { struct pattern_tag {}; diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index e95286af03ee..9cd5527b09a6 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1461,6 +1461,11 @@ void IRPrinter::visit(const Shuffle *op) { stream << paren(", ") << imm_int(op->slice_begin()) << paren(", ") << imm_int(op->slice_stride()) << paren(", ") << imm_int(op->indices.size()); + } else if (op->is_transpose()) { + openf("transpose_vector"); + print_list(op->vectors); + stream << paren(", ") << imm_int(op->transpose_factor()); + } else { openf("shuffle"); print_list(op->vectors); diff --git a/src/Lower.cpp b/src/Lower.cpp index 9b55bd20840d..08e6f8dd5b97 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -376,7 +376,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after partitioning loops:", s); debug(1) << "Staging strided loads...\n"; - s = stage_strided_loads(s); + s = stage_strided_loads(s, t); log("Lowering after staging strided loads:", s); debug(1) << "Trimming loops to the region over which they do something...\n"; diff --git a/src/Simplify_Add.cpp b/src/Simplify_Add.cpp index 6158cc9cd48c..06967a8d32d3 100644 --- a/src/Simplify_Add.cpp +++ b/src/Simplify_Add.cpp @@ -120,6 +120,7 @@ Expr Simplify::visit(const Add *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) + z), slice(x + y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (z - slice(y, c0, c1, c2)), slice(x - y, c0, c1, c2) + z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) + (slice(y, c0, c1, c2) - z), slice(x + y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) + transpose(y, c0), transpose(x + y, c0)) || (no_overflow(op->type) && (rewrite(x + x * y, x * (y + 1)) || diff --git a/src/Simplify_EQ.cpp b/src/Simplify_EQ.cpp index 994d14cd4cee..5d8c09901b49 100644 --- a/src/Simplify_EQ.cpp +++ b/src/Simplify_EQ.cpp @@ -195,6 +195,7 @@ Expr Simplify::visit(const 
EQ *op, ExprInfo *info) { slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) == slice(y, c0, c1, c2) + z, slice(x - y, c0, c1, c2) == z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) == transpose(y, c0), transpose(x == y, c0)) || false) || (no_overflow(a.type()) && EVAL_IN_LAMBDA // (rewrite(x * y == 0, (x == 0) || (y == 0)) || diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index bbd67a5bace0..da8a96e41f4f 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -328,8 +328,9 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); @@ -349,18 +350,41 @@ Expr Simplify::visit(const Load *op, ExprInfo *info) { op->image, op->param, const_true(new_lanes, nullptr), align); return Broadcast::make(load, b_index->lanes); } else if (s_index && - is_const_one(predicate) && (s_index->is_concat() || s_index->is_interleave())) { - // Loads of concats/interleaves should be concats/interleaves of loads + // Loads of concats/interleaves should be concats/interleaves of + // loads. We'll need to slice up the predicate though. std::vector loaded_vecs; for (const Expr &new_index : s_index->vectors) { int new_lanes = new_index.type().lanes(); + Expr predicate_slice = + is_const_one(predicate) ? const_true(new_lanes, nullptr) : + s_index->is_concat() ? 
+ Shuffle::make_slice(predicate, (int)loaded_vecs.size() * new_lanes, 1, new_lanes) : + Shuffle::make_slice(predicate, (int)loaded_vecs.size(), op->type.lanes() / new_lanes, new_lanes); + predicate_slice = mutate(predicate_slice, nullptr); + Expr load = Load::make(op->type.with_lanes(new_lanes), op->name, new_index, - op->image, op->param, const_true(new_lanes, nullptr), ModulusRemainder{}); + op->image, op->param, predicate_slice, ModulusRemainder{}); loaded_vecs.emplace_back(std::move(load)); } return Shuffle::make(loaded_vecs, s_index->indices); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense loads and transpose the + // resulting vector instead. + Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? 
+ predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); + Expr transposed_load = + Load::make(op->type, op->name, transposed_index, op->image, op->param, transposed_predicate, align); + return mutate(Shuffle::make_transpose(transposed_load, r_index->lanes), info); } else if (predicate.same_as(op->predicate) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/Simplify_Max.cpp b/src/Simplify_Max.cpp index 1926bc9a069e..cc4253ca718f 100644 --- a/src/Simplify_Max.cpp +++ b/src/Simplify_Max.cpp @@ -212,6 +212,7 @@ Expr Simplify::visit(const Max *op, ExprInfo *info) { rewrite(max(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(max(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(slice(y, c0, c1, c2), z)), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(max(slice(x, c0, c1, c2), max(z, slice(y, c0, c1, c2))), max(slice(max(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(max(transpose(x, c0), transpose(y, c0)), transpose(max(x, y), c0)) || (no_overflow(op->type) && (rewrite(max(max(x, y) + c0, x), max(x, y + c0), c0 < 0) || diff --git a/src/Simplify_Min.cpp b/src/Simplify_Min.cpp index 3f6084c6c4f1..e6515ab280e9 100644 --- a/src/Simplify_Min.cpp +++ b/src/Simplify_Min.cpp @@ -214,6 +214,7 @@ Expr Simplify::visit(const Min *op, ExprInfo *info) { rewrite(min(slice(x, c0, c1, c2), slice(y, c0, c1, c2)), slice(min(x, y), c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(slice(y, c0, c1, c2), z)), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(min(slice(x, c0, c1, c2), min(z, slice(y, c0, c1, c2))), min(slice(min(x, y), c0, c1, c2), z), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(min(transpose(x, c0), transpose(y, c0)), transpose(min(x, y), c0)) || (no_overflow(op->type) && 
(rewrite(min(min(x, y) + c0, x), min(x, y + c0), c0 > 0) || rewrite(min(min(x, y) + c0, x), min(x, y) + c0, c0 < 0) || diff --git a/src/Simplify_Mul.cpp b/src/Simplify_Mul.cpp index dfa38d39111c..e1bcb68fe7bc 100644 --- a/src/Simplify_Mul.cpp +++ b/src/Simplify_Mul.cpp @@ -81,6 +81,7 @@ Expr Simplify::visit(const Mul *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) * slice(y, c0, c1, c2), slice(x * y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (slice(y, c0, c1, c2) * z), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite(slice(x, c0, c1, c2) * (z * slice(y, c0, c1, c2)), slice(x * y, c0, c1, c2) * z, c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) * transpose(y, c0), transpose(x * y, c0)) || false) { return mutate(rewrite.result, info); diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..2a614ac81744 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -95,10 +95,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { // broadcast. Note that it doesn't matter what the indices // are. 
const Broadcast *b1 = new_vectors[0].as(); - if (b1) { + if (b1 && b1->value.type().is_scalar()) { bool can_collapse = true; for (size_t i = 1; i < new_vectors.size() && can_collapse; i++) { - if (const Broadcast *b2 = new_vectors[i].as()) { + if (const Broadcast *b2 = new_vectors[i].as(); + b2 && b2->value.type().is_scalar()) { Expr check = mutate(b1->value - b2->value, nullptr); can_collapse &= is_const_zero(check); } else { diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 152c8b0fe797..3a6b459b5c88 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -342,12 +342,14 @@ Stmt Simplify::visit(const Store *op) { } ExprInfo base_info; - if (const Ramp *r = index.as()) { - mutate(r->base, &base_info); + const Ramp *r_index = index.as(); + if (r_index) { + mutate(r_index->base, &base_info); } base_info.alignment = ModulusRemainder::intersect(base_info.alignment, index_info.alignment); const Load *load = value.as(); + const Shuffle *shuf = index.as(); const Broadcast *scalar_pred = predicate.as(); if (scalar_pred && !scalar_pred->value.type().is_scalar()) { // Nested vectorization @@ -365,6 +367,45 @@ Stmt Simplify::visit(const Store *op) { } else if (is_undef(value) || (load && load->name == op->name && equal(load->index, index))) { // foo[x] = foo[x] or foo[x] = undef is a no-op return Evaluate::make(0); + } else if (shuf && shuf->is_concat()) { + // Break a store of a concat of vector indices into separate stores. A + // concat index will result in a general scatter at codegen time. We + // should just break it up here, where there is a hope that the + // individual elements might be simplifiable to dense ramps. 
+ std::string var_name = unique_name('t'); + Expr var = Variable::make(value.type(), var_name); + std::vector stores; + int lanes = 0; + for (const Expr &idx : shuf->vectors) { + stores.push_back(Store::make(op->name, + Shuffle::make_slice(var, lanes, 1, idx.type().lanes()), + idx, + op->param, + Shuffle::make_slice(predicate, lanes, 1, idx.type().lanes()), + ModulusRemainder{})); + lanes += idx.type().lanes(); + } + Stmt s = Block::make(stores); + s = LetStmt::make(var_name, value, s); + return mutate(s); + } else if (const Ramp *inner_ramp = r_index ? r_index->base.as() : nullptr; + inner_ramp && + !is_const_one(inner_ramp->stride) && + is_const_one(r_index->stride)) { + // If it's a nested ramp and the outer ramp has stride 1, swap the + // nesting order of the ramps to make dense stores and transpose the + // index and value instead. Later in lowering after flattening the + // nested ramps it will turn into a concat of dense ramps and hit the + // case above. + Expr transposed_index = + Ramp::make(Ramp::make(inner_ramp->base, make_one(inner_ramp->base.type()), r_index->lanes), + Broadcast::make(inner_ramp->stride, r_index->lanes), inner_ramp->lanes); + Expr transposed_value = Shuffle::make_transpose(value, inner_ramp->lanes); + Expr transposed_predicate = (predicate.as() ? 
+ predicate : // common case optimization + Shuffle::make_transpose(predicate, inner_ramp->lanes)); + return mutate(Store::make(op->name, transposed_value, transposed_index, + op->param, transposed_predicate, align)); } else if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index) && align == op->alignment) { return op; } else { diff --git a/src/Simplify_Sub.cpp b/src/Simplify_Sub.cpp index 29bd02c78ed6..2444cb6fd1d9 100644 --- a/src/Simplify_Sub.cpp +++ b/src/Simplify_Sub.cpp @@ -177,6 +177,7 @@ Expr Simplify::visit(const Sub *op, ExprInfo *info) { rewrite(slice(x, c0, c1, c2) - (slice(y, c0, c1, c2) + z), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((slice(x, c0, c1, c2) - z) - slice(y, c0, c1, c2), slice(x - y, c0, c1, c2) - z, c2 > 1 && lanes_of(x) == lanes_of(y)) || rewrite((z - slice(x, c0, c1, c2)) - slice(y, c0, c1, c2), z - slice(x + y, c0, c1, c2), c2 > 1 && lanes_of(x) == lanes_of(y)) || + rewrite(transpose(x, c0) - transpose(y, c0), transpose(x - y, c0)) || (no_overflow(op->type) && EVAL_IN_LAMBDA // (rewrite(max(x, y) - x, max(y - x, 0)) || diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index 5073d6194522..896a33b5193e 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -104,7 +104,7 @@ class FindStridedLoads : public IRVisitor { // TODO: We do not yet handle nested vectorization here for // ramps which have not already collapsed. We could potentially // handle more interesting types of shuffle than simple flat slices. 
- if (stride >= 2 && stride <= r->lanes && r->stride.type().is_scalar()) { + if (stride >= 2 && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { @@ -283,7 +283,7 @@ bool can_hoist_shared_load(const IRNode *n, const std::string &buf, const Expr & } // namespace -Stmt stage_strided_loads(const Stmt &stmt) { +Stmt stage_strided_loads(const Stmt &stmt, const Target &target) { FindStridedLoads finder; ReplaceStridedLoads replacer; @@ -334,9 +334,23 @@ Stmt stage_strided_loads(const Stmt &stmt) { Type t = k.type.with_lanes(lanes); const Load *op = load->second[0]; + int last_offset = first_offset; + int64_t biggest_gap = 0; std::set all_loads; for (auto l = load; l != v.end() && l->first < first_offset + k.stride; l++) { all_loads.insert(l->second.begin(), l->second.end()); + biggest_gap = std::max(biggest_gap, l->first - last_offset); + last_offset = l->first; + } + biggest_gap = std::max(biggest_gap, (first_offset + k.stride) - last_offset); + + // If our contiguous shared load has contiguous vectors in it of + // size at least k.lanes that are going to be entirely unused, this + // is a bad idea (e.g. a cluster of {ramp(0, 1024, 8) and ramp(37, + // 1024, 8)} should not be staged). + if (biggest_gap >= k.lanes) { + load++; + continue; } Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param, @@ -353,15 +367,28 @@ Stmt stage_strided_loads(const Stmt &stmt) { const IRNode *outermost = k.scope ? k.scope : s.get(); const IRNode *let_site = innermost_containing_node(outermost, all_loads); if (can_hoist_shared_load(let_site, k.buf, idx)) { + // For larger strides we can do a better job at shuffling if we + // do it as one big task. For stride 2 it interferes with + // horizontal add pattern matching. On ARM it also interferes + // with LLVM's pattern matching for vld3 and vld4. 
+ bool transpose_shared_load = k.stride > 2; + if (target.arch == Target::ARM || target.arch == Target::Hexagon) { + transpose_shared_load = k.stride > 4; + } std::string name = unique_name('t'); Expr var = Variable::make(shared_load.type(), name); for (; load != v.end() && load->first < first_offset + k.stride; load++) { int row = load->first - first_offset; - Expr shuf = Shuffle::make_slice(var, row, k.stride, k.lanes); + Expr shuf = transpose_shared_load ? + Shuffle::make_slice(var, row * k.lanes, 1, k.lanes) : + Shuffle::make_slice(var, row, k.stride, k.lanes); for (const Load *l : load->second) { replacer.replacements.emplace(l, shuf); } } + if (transpose_shared_load) { + shared_load = Shuffle::make_transpose(shared_load, k.stride); + } replacer.let_injections[let_site].emplace_back(name, shared_load); } else { for (; load != v.end() && load->first < first_offset + k.stride; load++) { @@ -378,7 +405,7 @@ Stmt stage_strided_loads(const Stmt &stmt) { // picked up in a cluster, but for whom we know it's safe to do a // dense load before their start. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t delta = k.stride - 1; @@ -403,7 +430,7 @@ Stmt stage_strided_loads(const Stmt &stmt) { // Look for any loads we can densify because an overlapping load occurs // in any parent scope. for (const auto &[offset, loads] : reverse_view(v)) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } int64_t min_offset = offset; @@ -443,7 +470,7 @@ Stmt stage_strided_loads(const Stmt &stmt) { // external allocations by doing a dense load at a trimmed size. We rely // on codegen to do a good job at loading vectors of a funny size. 
for (const auto &[offset, loads] : v) { - if (replacer.replacements.count(loads[0])) { + if (replacer.replacements.count(loads[0]) || k.lanes < k.stride) { continue; } diff --git a/src/StageStridedLoads.h b/src/StageStridedLoads.h index a29cef2438f1..b6afd3770981 100644 --- a/src/StageStridedLoads.h +++ b/src/StageStridedLoads.h @@ -37,7 +37,7 @@ namespace Internal { * internal allocations it adds padding to the allocation explicitly, by setting * the padding field on Allocate nodes. */ -Stmt stage_strided_loads(const Stmt &s); +Stmt stage_strided_loads(const Stmt &s, const Target &target); } // namespace Internal } // namespace Halide diff --git a/src/Util.h b/src/Util.h index 6382b497dc5d..f29e0ad9b6f0 100644 --- a/src/Util.h +++ b/src/Util.h @@ -575,6 +575,16 @@ inline int64_t next_power_of_two(int64_t x) { return static_cast(1) << static_cast(std::ceil(std::log2(x))); } +/** Returns the largest power of two which is a factor of the argument. */ +inline int64_t largest_power_of_two_factor(int64_t x) { + return x & -x; +} + +/** Return whether or not an integer is a power of two. 
Zero and negative values are not powers of two. */ +inline bool is_power_of_two(int64_t x) { + return x > 0 && (x & (x - 1)) == 0; +} + template inline T align_up(T x, int n) { return (x + n - 1) / n * n; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 6d41a9e71219..04c087f33338 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -322,6 +322,7 @@ tests(GROUPS correctness tracing_broadcast.cpp tracing_stack.cpp transitive_bounds.cpp + transpose_idioms.cpp trim_no_ops.cpp tuple_partial_update.cpp tuple_reduction.cpp diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index f0183412323a..226884e2cac6 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -447,13 +447,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { Expr shift = (i_2 % bits) - (bits / 2); Expr round_s = (cast_i(1) >> min(shift, 0)) / 2; Expr round_u = (cast_u(1) >> min(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift)); - add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift)); + // The r suffix is optional - it just changes which of the two args gets clobbered + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) << shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) << shift)); round_s = (cast_i(1) << max(shift, 0)) / 2; round_u = (cast_u(1) << max(shift, 0)) / 2; - add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift)); - add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift)); + add_8_16_32(sel_op("vrshl.s", "srshlr?"), cast_i((widen_i(i_1) + round_s) >> shift)); + add_8_16_32(sel_op("vrshl.u", "urshlr?"), cast_u((widen_u(u_1) + round_u) >> shift)); // VRSHR I - Rounding Shift Right add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1));
@@ -1220,6 +1221,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { std::stringstream type_name_stream; type_name_stream << e.type(); std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor); + + // Some regex symbols are illegal in filenames on windows + std::string illegal = "<>:\"/\\|?*"; + std::replace_if(decorated_op_name.begin(), decorated_op_name.end(), // + [&](char c) { return illegal.find(c) != std::string::npos; }, '_'); + auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size()); // Bail out after generating the unique_name, so that names are diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index 8a82f5ca33d1..e0373a5be69e 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -200,7 +200,7 @@ int main(int argc, char **argv) { { Func f; Var x; - f(x) = buf(17 * x) + buf(17 * x + 15); + f(x) = buf(50 * x) + buf(50 * x + 15); f.vectorize(x, 16, TailStrategy::RoundUp); checker.check_not(f, 0); diff --git a/test/correctness/transpose_idioms.cpp b/test/correctness/transpose_idioms.cpp new file mode 100644 index 000000000000..9fb29c2883e0 --- /dev/null +++ b/test/correctness/transpose_idioms.cpp @@ -0,0 +1,212 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +// This test enumerates all the scheduling idioms in Halide that *should* +// produce good code for a transpose/interleave/deinterleave operation. 
+ +class Checker : public IRMutator { + + using IRMutator::visit; + + Expr visit(const Load *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_loads++; + } else if (op->type.is_vector()) { + gathers++; + } + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + if (const Ramp *r = op->index.as(); + r && is_const_one(r->stride)) { + dense_stores++; + } else if (op->index.type().is_vector()) { + scatters++; + } + return IRMutator::visit(op); + } + + Expr visit(const Shuffle *op) override { + transposes += op->is_transpose(); + interleaves += op->is_interleave(); + if (op->is_slice()) { + if (op->slice_stride() == 1) { + dense_slices++; + } else { + strided_slices++; + } + } + return IRMutator::visit(op); + } + +public: + int dense_loads = 0; + int gathers = 0; + int dense_stores = 0; + int scatters = 0; + int dense_slices = 0; + int strided_slices = 0; + int interleaves = 0; + int transposes = 0; + + void check() { + internal_assert(gathers == 0) << "Vector gathers found"; + internal_assert(scatters == 0) << "Vector scatters found"; + internal_assert(strided_slices == 0) << "strided slices found"; + internal_assert(dense_loads) << "No dense loads found"; + internal_assert(dense_stores) << "No dense stores found"; + internal_assert(interleaves + transposes) << "No interleaves or transposes found"; + } +}; + +void check(Func g) { + Checker checker; + g.add_custom_lowering_pass(&checker, nullptr); + + // Choose a shape with lots of factors so that our RoundUp schedules work + int n = 16 * 9 * 7; + Buffer out = g.realize({n, n}); + for (int y = 0; y < out.height(); y++) { + for (int x = 0; x < out.width(); x++) { + int correct = 100 * x + y; + internal_assert(out(x, y) == correct) + << "out(" << x << ", " << y << ") = " << out(x, y) + << " instead of " << correct << "\n"; + } + } + + checker.check(); +} + +int main(int argc, char **argv) { + Var x{"x"}, y{"y"}, xi{"xi"}, yi{"yi"}; + + // In each case 
we'll say g(x, y) = f(y, x) and tile it. We will try power + // of two sizes, and sizes that are coprime, and sizes that are neither + // coprime nor powers of two. We'll use sizes larger than 4, because some + // backends like to do different things for small strides. + + for (auto tile : {std::pair{8, 16}, {7, 5}, {6, 9}}) { + { + // Idiom 1: Strided stores into a staged transposed copy of the + // input. The strided stores then get mashed together into one big + // interleave + store by the pass that interleaves strided + // stores. This has to be done on a staged copy of the input rather + // than g so that the strided stores have a constant stride. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).reorder_storage(y, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 2: Vectorize x, unroll y. Stage a copy of the input but + // don't transpose it. This will create strided loads from the + // staged input that get hoisted out into one big dense load + + // transpose by the stage_strided_loads pass. The staging is + // required so that the strides are constant. + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .unroll(yi); + + f.in().compute_at(g, x).vectorize(x).unroll(y); + + check(g); + } + + { + // Idiom 3: Vectorize both, x innermost. This should be handled by + // shuffle optimization logic in the simplifier: a store of a concat + // of ramps turns into a sequence of stores of slices of the RHS, + // and a load of a ramp of a ramp where the *outer* ramp has stride + // 1 but the inner doesn't turns into a transpose of a concat of + // dense loads. 
+ Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + + { + // Idiom 4: Vectorize both, y innermost. In this case the store of a + // ramp of a ramp gets rewritten by the simplifier to move the ramp + // with stride one innermost, transposing the RHS. + + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g.tile(x, y, xi, yi, tile.first, tile.second, TailStrategy::RoundUp) + .reorder(yi, xi) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + { + // Check the double-vectorization approaches also work when there is a + // vector predicate on one of the two vectors, to be sure the simplifier + // is transforming the predicate correctly. We can't predicate both, + // because the vectorizer can't handle it and generates a scalar tail. + { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::Predicate) + .split(y, y, yi, 11, TailStrategy::ShiftInwards) + .reorder(xi, yi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + { + Func f{"f"}, g{"g"}; + f(x, y) = x + 100 * y; + g(x, y) = f(y, x); + f.compute_root(); + + g + .never_partition(x, y) + .split(x, x, xi, 13, TailStrategy::ShiftInwards) + .split(y, y, yi, 11, TailStrategy::Predicate) + .reorder(yi, xi, x, y) + .vectorize(xi) + .vectorize(yi); + + check(g); + } + } + + printf("Success!\n"); +} diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index 3e8142f882c0..4c2b515814b5 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -18,6 +18,7 @@ tests(GROUPS performance fast_pow.cpp fast_sine_cosine.cpp gpu_half_throughput.cpp + interleave.cpp jit_stress.cpp lots_of_inputs.cpp memcpy.cpp diff --git a/test/performance/block_transpose.cpp 
b/test/performance/block_transpose.cpp index 740908358443..921d7f9a913b 100644 --- a/test/performance/block_transpose.cpp +++ b/test/performance/block_transpose.cpp @@ -7,108 +7,77 @@ using namespace Halide; using namespace Halide::Tools; -enum { - scalar_trans, - vec_y_trans, - vec_x_trans +struct Result { + int type_size, block_width, block_height; + double bandwidth; }; -Buffer test_transpose(int mode) { - Func input, block, block_transpose, output; - Var x, y; - - input(x, y) = cast(x + y); - input.compute_root(); +template +Result test_transpose(int block_width, int block_height, const Target &t) { + const int N = 256; + Buffer in(N, N), out(N, N); - block(x, y) = input(x, y); - block_transpose(x, y) = block(y, x); - output(x, y) = block_transpose(x, y); - - Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. - block.compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block_transpose.compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block_transpose.compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block_transpose.compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + in(x, y) = (T)(x + y * N); + } } - Buffer result(1024, 1024); - output.compile_jit(); - - output.realize(result); - - double t = benchmark([&]() { - output.realize(result); - }); - - std::cout << "Dummy Func version: " << algorithm << " bandwidth " << 
1024 * 1024 / t << " byte/s.\n"; - return result; -} - -/* This illustrates how to achieve the same scheduling behavior using the 'in()' - * directive as opposed to creating dummy Funcs as done in 'test_transpose()' */ -Buffer test_transpose_wrap(int mode) { Func input, block_transpose, block, output; Var x, y; - input(x, y) = cast(x + y); - input.compute_root(); + input(x, y) = in(x, y); output(x, y) = input(y, x); Var xi, yi; - output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi); - - // Do 8 vectorized loads from the input. - block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y); - - std::string algorithm; - switch (mode) { - case scalar_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).unroll(x).unroll(y); - algorithm = "Scalar transpose"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "scalar_transpose.s", std::vector()); - break; - case vec_y_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(y).unroll(x); - algorithm = "Transpose vectorized in y"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_y.s", std::vector()); - break; - case vec_x_trans: - block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); - algorithm = "Transpose vectorized in x"; - output.compile_to_assembly(Internal::get_test_tmp_dir() + "fast_transpose_x.s", std::vector()); - break; - } + output.tile(x, y, xi, yi, block_width, block_height, TailStrategy::RoundUp) + .vectorize(xi) + .vectorize(yi); + + // Explicitly vectorized loads from the input. Was necessary before we + // automatically swizzled the 2D load into dense order. + // input.in().compute_at(output, x).vectorize(x).unroll(y); + + // Explicit transpose in registers. 
This used to be the idiom, but is no + // longer necessary because stage_strided_loads should detect the strided + // loads from input.in() and turn it into a transpose. + // input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y); + + // TODO: Should not be necessary, but prevents licm from doing something dumb. + output.output_buffer().dim(0).set_bounds(0, 256); - Buffer result(1024, 1024); output.compile_jit(); - output.realize(result); + output.realize(out); - double t = benchmark([&]() { - output.realize(result); + double time = benchmark(10, 10, [&]() { + output.realize(out); }); - std::cout << "Wrapper version: " << algorithm << " bandwidth " << 1024 * 1024 / t << " byte/s.\n"; - return result; + for (int y = 0; y < N; y++) { + for (int x = 0; x < N; x++) { + T actual = out(x, y), correct = in(y, x); + if (actual != correct) { + std::cerr << "For block size (" << block_width << ", " << block_height << "): " + << "out(" << x << ", " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + /* + output.compile_to_assembly(Internal::get_test_tmp_dir() + "transpose_uint" + + std::to_string(sizeof(T) * 8) + "_" + + std::to_string(block_width) + "x" + + std::to_string(block_height) + ".s", + std::vector{in}, "transpose", t); + */ + + return Result{(int)sizeof(T), block_width, block_height, + out.size_in_bytes() / (1.0e9 * time)}; } int main(int argc, char **argv) { @@ -118,23 +87,48 @@ int main(int argc, char **argv) { return 0; } - test_transpose(scalar_trans); - test_transpose_wrap(scalar_trans); - test_transpose(vec_y_trans); - test_transpose_wrap(vec_y_trans); - - Buffer im1 = test_transpose(vec_x_trans); - Buffer im2 = test_transpose_wrap(vec_x_trans); - - // Check correctness of the wrapper version - for (int y = 0; y < im2.height(); y++) { - for (int x = 0; x < im2.width(); x++) { - if (im2(x, y) != im1(x, y)) { - printf("wrapper(%d, %d) = %d instead of 
%d\n", - x, y, im2(x, y), im1(x, y)); - return 1; + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "Computing best tile sizes for each type\n"; + std::vector results; + int limit = 64 * 64; + for (int bh : {1, 2, 4, 8, 16, 32, 64}) { + for (int bw : {1, 2, 4, 8, 16, 32, 64}) { + std::cout << "." << std::flush; + results.push_back(test_transpose(bw, bh, target)); + if (bw * bh <= limit / 2) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 4) { + results.push_back(test_transpose(bw, bh, target)); + } + if (bw * bh <= limit / 8) { + results.push_back(test_transpose(bw, bh, target)); + } + } + } + std::cout << "\nbytes, tile width, tile height, bandwidth (GB/s):\n"; + + // Sort the results by bandwidth + std::sort(results.begin(), results.end(), + [](const Result &a, const Result &b) { + return a.bandwidth > b.bandwidth; + }); + + // Print top n tile sizes for each type + for (int t : {1, 2, 4, 8}) { + int top_n = 5; + for (size_t i = 0; i < results.size() && top_n > 0; i++) { + if (results[i].type_size == t) { + std::cout << t << " " + << results[i].block_width << " " + << results[i].block_height << " " + << results[i].bandwidth << "\n"; + top_n--; } } + std::cout << "\n"; } printf("Success!\n"); diff --git a/test/performance/interleave.cpp b/test/performance/interleave.cpp new file mode 100644 index 000000000000..ee1598e40d41 --- /dev/null +++ b/test/performance/interleave.cpp @@ -0,0 +1,158 @@ +#include "Halide.h" +#include "halide_benchmark.h" +#include "halide_test_dirs.h" + +#include + +using namespace Halide; +using namespace Halide::Tools; + +struct Result { + int type_size, factor; + double bandwidth; +}; + +template +Result test_interleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N, factor), out(N * factor); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + in(x, y) 
= (T)(x * factor + y); + } + } + + Func output; + Var x, y; + + output(x) = in(x / factor, x % factor); + + Var xi, yi; + // We'll use the interleaving-stores scheduling idiom, where unrolled + // strided stores get mashed together into a single dense store of an + // interleave_vectors call. + output.unroll(x, factor, TailStrategy::RoundUp) + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + output.output_buffer().dim(0).set_min(0); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + uint64_t actual = out(x * factor + y), correct = in(x, y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << " * " << factor << " + " << y << ") = " + << actual << " instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "interleave", t); + + return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; +} + +template +Result test_deinterleave(int factor, const Target &t) { + const int N = 8192; + Buffer in(N * factor), out(N, factor); + + for (int x = 0; x < N; x++) { + for (int y = 0; y < factor; y++) { + in(x * factor + y) = (T)(x + y * N); + } + } + + Func output; + Var x, y; + + output(x, y) = in(x * factor + y); + + Var xi, yi; + output.bound(y, 0, factor) + .reorder(y, x) + .unroll(y) // Also works if we vectorize y + .vectorize(x, t.natural_vector_size(), TailStrategy::RoundUp); + + output.compile_jit(); + + output.realize(out); + + double time = benchmark(20, 20, [&]() { + output.realize(out); + }); + + for (int y = 0; y < factor; y++) { + for (int x = 0; x < N; x++) { + uint64_t actual = out(x, y), correct = in(x * factor + y); + if (actual != correct) { + std::cerr << "For factor " << factor + << "out(" << x << ", " << y << ") = " + << actual << " 
instead of " << correct << "\n"; + exit(1); + } + } + } + + // Uncomment to dump asm for inspection + // output.compile_to_assembly("/dev/stdout", + // std::vector{in}, "deinterleave", t); + + return Result{(int)sizeof(T), factor, out.size_in_bytes() / (1.0e9 * time)}; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + if (target.arch == Target::WebAssembly) { + printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); + return 0; + } + + // Set the target features to use for dumping to assembly + target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery}); + + std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n"; + for (int t : {1, 2, 4, 8}) { + for (int f = 2; f < 16; f++) { + Result r1, r2; + switch (t) { + case 1: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 2: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 4: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + case 8: + r1 = test_interleave(f, target); + r2 = test_deinterleave(f, target); + break; + default: + break; + } + std::cout << r1.type_size << " " + << r1.factor << " " + << r1.bandwidth << " " + << r2.bandwidth << "\n"; + } + } + + printf("Success!\n"); + return 0; +}