Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 2 additions & 85 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,96 +1027,13 @@ void CodeGen_ARM::visit(const Load *op) {
return;
}

// If the stride is in [-1, 4], we can deal with that using vanilla codegen
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;

// If the stride is one or minus one, we can deal with that using vanilla codegen
if (stride && (stride->value == 1 || stride->value == -1)) {
if (stride && (-1 <= stride->value && stride->value <= 4)) {
CodeGen_Posix::visit(op);
return;
}

// Strided loads with known stride
if (stride && stride->value >= 2 && stride->value <= 4) {
// Check alignment on the base. Attempt to shift to an earlier
// address if it simplifies the expression. This makes
// adjacent strided loads share a vldN op.
Expr base = ramp->base;
int offset = 0;
ModulusRemainder mod_rem = modulus_remainder(ramp->base);

const Add *add = base.as<Add>();
const IntImm *add_b = add ? add->b.as<IntImm>() : nullptr;

if ((mod_rem.modulus % stride->value) == 0) {
offset = mod_rem.remainder % stride->value;
} else if ((mod_rem.modulus == 1) && add_b) {
offset = add_b->value % stride->value;
if (offset < 0) {
offset += stride->value;
}
}

if (offset) {
base = simplify(base - offset);
mod_rem.remainder -= offset;
if (mod_rem.modulus) {
mod_rem.remainder = mod_imp(mod_rem.remainder, mod_rem.modulus);
}
}

int alignment = op->type.bytes();
alignment *= gcd(mod_rem.modulus, mod_rem.remainder);
// Maximum stack alignment on arm is 16 bytes, so we should
// never claim alignment greater than that.
alignment = gcd(alignment, 16);
internal_assert(alignment > 0);

// Decide what width to slice things into. If not a multiple
// of 64 or 128 bits, then we can't safely slice it up into
// some number of vlds, so we hand it over to the base class.
int bit_width = op->type.bits() * op->type.lanes();
int intrin_lanes = 0;
if (bit_width % 128 == 0) {
intrin_lanes = 128 / op->type.bits();
} else if (bit_width % 64 == 0) {
intrin_lanes = 64 / op->type.bits();
} else {
CodeGen_Posix::visit(op);
return;
}

llvm::Type *load_return_type = llvm_type_of(op->type.with_lanes(intrin_lanes * stride->value));
llvm::Type *load_return_pointer_type = load_return_type->getPointerTo();
Value *undef = UndefValue::get(load_return_type);
SmallVector<Constant *, 256> constants;
for (int j = 0; j < intrin_lanes; j++) {
Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset);
constants.push_back(constant);
}
Constant *constantsV = ConstantVector::get(constants);

vector<Value *> results;
for (int i = 0; i < op->type.lanes(); i += intrin_lanes) {
Expr slice_base = simplify(base + i * ramp->stride);
Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_lanes);
Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base);
Value *bitcastI = builder->CreateBitOrPointerCast(ptr, load_return_pointer_type);
LoadInst *loadI = cast<LoadInst>(builder->CreateLoad(bitcastI));
#if LLVM_VERSION >= 110
loadI->setAlignment(Align(alignment));
#else
loadI->setAlignment(MaybeAlign(alignment));
#endif
add_tbaa_metadata(loadI, op->name, slice_ramp);
Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV);
results.push_back(shuffleInstr);
}

// Concat the results
value = concat_vectors(results);
return;
}

// We have builtins for strided loads with fixed but unknown stride, but they use inline assembly.
if (target.bits != 64 /* Not yet implemented for aarch64 */) {
ostringstream builtin;
Expand Down
153 changes: 82 additions & 71 deletions src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1910,63 +1910,66 @@ void CodeGen_LLVM::visit(const Load *op) {

if (ramp && stride && stride->value == 1) {
value = codegen_dense_vector_load(op);
} else if (ramp && stride && stride->value == 2) {
// Load two vectors worth and then shuffle
Expr base_a = ramp->base, base_b = ramp->base + ramp->lanes;
Expr stride_a = make_one(base_a.type());
Expr stride_b = make_one(base_b.type());

ModulusRemainder align_a = op->alignment;
ModulusRemainder align_b = align_a + ramp->lanes;

// False indicates we should take the even-numbered lanes
// from the load, true indicates we should take the
// odd-numbered-lanes.
bool shifted_a = false, shifted_b = false;
} else if (ramp && stride && 2 <= stride->value && stride->value <= 4) {
// Try to rewrite strided loads as shuffles of dense loads,
// aligned to the stride. This makes adjacent strided loads
// share the same underlying dense loads.
ModulusRemainder align = op->alignment;
Expr base = ramp->base;
int aligned_stride = gcd(stride->value, align.modulus);
int offset = 0;
if (aligned_stride == stride->value) {
offset = mod_imp((int)align.remainder, aligned_stride);
} else {
const Add *add = base.as<Add>();
if (const IntImm *add_c = add ? add->b.as<IntImm>() : base.as<IntImm>()) {
offset = mod_imp(add_c->value, stride->value);
}
}

bool external = op->param.defined() || op->image.defined();
if (offset) {
base = simplify(base - offset);
align.remainder -= offset;
}

// Don't read beyond the end of an external buffer.
// We want to load a few more bytes than the original load did.
// We know this is safe for internal buffers because we allocate
// padding.
// (In ASAN mode, don't read beyond the end of internal buffers either,
// as ASAN will complain even about harmless stack overreads.)
// The min moves lower by offset.
int load_lanes = ramp->lanes * stride->value;
bool external = op->param.defined() || op->image.defined();
if (external || target.has_feature(Target::ASAN)) {
base_b -= 1;
align_b = align_b - 1;
shifted_b = true;
} else {
// If the base ends in an odd constant, then subtract one
// and do a different shuffle. This helps expressions like
// (f(2*x) + f(2*x+1)) share loads
const Add *add = ramp->base.as<Add>();
const IntImm *offset = add ? add->b.as<IntImm>() : ramp->base.as<IntImm>();
if (offset && offset->value & 1) {
base_a -= 1;
align_a = align_a - 1;
shifted_a = true;
base_b -= 1;
align_b = align_b - 1;
shifted_b = true;
}
load_lanes -= (stride->value - 1 - offset);
}

// Do each load.
Expr ramp_a = Ramp::make(base_a, stride_a, ramp->lanes);
Expr ramp_b = Ramp::make(base_b, stride_b, ramp->lanes);
Expr load_a = Load::make(op->type, op->name, ramp_a, op->image, op->param, op->predicate, align_a);
Expr load_b = Load::make(op->type, op->name, ramp_b, op->image, op->param, op->predicate, align_b);
Value *vec_a = codegen(load_a);
Value *vec_b = codegen(load_b);
int slice_lanes = native_vector_bits() / op->type.bits();

// Shuffle together the results.
vector<int> indices(ramp->lanes);
for (int i = 0; i < (ramp->lanes + 1) / 2; i++) {
indices[i] = i * 2 + (shifted_a ? 1 : 0);
}
for (int i = (ramp->lanes + 1) / 2; i < ramp->lanes; i++) {
indices[i] = i * 2 + (shifted_b ? 1 : 0);
// We need to slice the result into native vector lanes, otherwise
// LLVM misses optimizations like using ldN on ARM.
vector<Value *> results;
for (int i = 0; i < op->type.lanes(); i += slice_lanes) {
int load_lanes_i = std::min<int>(slice_lanes * stride->value, load_lanes - i);
int lanes_i = std::min<int>(slice_lanes, op->type.lanes() - i);
Expr slice_base = simplify(base + i * ramp->stride);

Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base,
op->image, op->param, op->alignment, nullptr, false);

SmallVector<Constant *, 256> constants;
for (int j = 0; j < lanes_i; j++) {
Constant *constant = ConstantInt::get(i32_t, j * stride->value + offset);
constants.push_back(constant);
}
Constant *constantsV = ConstantVector::get(constants);
Value *undef = UndefValue::get(load_i->getType());
Value *shuffleInstr = builder->CreateShuffleVector(load_i, undef, constantsV);
results.push_back(shuffleInstr);
}

value = shuffle_vectors(vec_a, vec_b, indices);
// Concat the results
value = concat_vectors(results);
} else if (ramp && stride && stride->value == -1) {
// Load the vector and then flip it in-place
Expr flipped_base = ramp->base - ramp->lanes + 1;
Expand Down Expand Up @@ -2249,14 +2252,14 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) {
}
}

Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n";

const Ramp *ramp = load->index.as<Ramp>();
internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
llvm::Value *CodeGen_LLVM::codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
llvm::Value *vpred, bool slice_to_native) {
debug(4) << "Vectorize predicated dense vector load:\n\t"
<< "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n";

bool is_external = (external_buffer.find(load->name) != external_buffer.end());
int alignment = load->type.bytes(); // The size of a single element
bool is_external = (external_buffer.find(name) != external_buffer.end());
int align_bytes = type.bytes(); // The size of a single element

int native_bits = native_vector_bits();
int native_bytes = native_bits / 8;
Expand All @@ -2266,60 +2269,68 @@ Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) {
// maximum alignment we can infer based on the index alone.

// Boost the alignment if possible, up to the native vector width.
ModulusRemainder mod_rem = load->alignment;
ModulusRemainder mod_rem = alignment;
while ((mod_rem.remainder & 1) == 0 &&
(mod_rem.modulus & 1) == 0 &&
alignment < native_bytes) {
align_bytes < native_bytes) {
mod_rem.modulus /= 2;
mod_rem.remainder /= 2;
alignment *= 2;
align_bytes *= 2;
}

// If it is an external buffer, then we cannot assume that the host pointer
// is aligned to at least native vector width. However, we may be able to do
// better than just assuming that it is unaligned.
if (is_external) {
if (load->param.defined()) {
int host_alignment = load->param.host_alignment();
alignment = gcd(alignment, host_alignment);
} else if (get_target().has_feature(Target::JIT) && load->image.defined()) {
if (param.defined()) {
int host_alignment = param.host_alignment();
align_bytes = gcd(align_bytes, host_alignment);
} else if (get_target().has_feature(Target::JIT) && image.defined()) {
// If we're JITting, use the actual pointer value to determine alignment for embedded buffers.
alignment = gcd(alignment, (int)(((uintptr_t)load->image.data()) & std::numeric_limits<int>::max()));
align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits<int>::max()));
}
}

// For dense vector loads wider than the native vector
// width, bust them up into native vectors
int load_lanes = load->type.lanes();
int native_lanes = std::max(1, native_bits / load->type.bits());
int load_lanes = type.lanes();
int native_lanes = slice_to_native ? std::max(1, native_bits / type.bits()) : load_lanes;
vector<Value *> slices;
for (int i = 0; i < load_lanes; i += native_lanes) {
int slice_lanes = std::min(native_lanes, load_lanes - i);
Expr slice_base = simplify(ramp->base + i);
Expr slice_base = simplify(base + i);
Expr slice_stride = make_one(slice_base.type());
Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes);
llvm::Type *slice_type = get_vector_type(llvm_type_of(load->type.element_of()), slice_lanes);
Value *elt_ptr = codegen_buffer_pointer(load->name, load->type.element_of(), slice_base);
llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes);
Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base);
Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo());

Instruction *load_inst;
if (vpred != nullptr) {
Value *slice_mask = slice_vector(vpred, i, slice_lanes);
#if LLVM_VERSION >= 110
load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(alignment), slice_mask);
load_inst = builder->CreateMaskedLoad(vec_ptr, llvm::Align(align_bytes), slice_mask);
#else
load_inst = builder->CreateMaskedLoad(vec_ptr, alignment, slice_mask);
load_inst = builder->CreateMaskedLoad(vec_ptr, align_bytes, slice_mask);
#endif
} else {
load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(alignment));
load_inst = builder->CreateAlignedLoad(vec_ptr, llvm::Align(align_bytes));
}
add_tbaa_metadata(load_inst, load->name, slice_index);
add_tbaa_metadata(load_inst, name, slice_index);
slices.push_back(load_inst);
}
value = concat_vectors(slices);
return value;
}

// Convenience overload: unwraps a dense (stride-1) Load node and forwards
// its pieces (type, name, base, image, param, alignment) to the general
// codegen_dense_vector_load overload.
Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) {
// A dense vector load must be indexed by a Ramp with a constant stride of 1.
const Ramp *ramp = load->index.as<Ramp>();
internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";

return codegen_dense_vector_load(load->type, load->name, ramp->base, load->image, load->param,
load->alignment, vpred, slice_to_native);
}

void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) {
const Ramp *ramp = op->index.as<Ramp>();
const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
Expand Down
5 changes: 4 additions & 1 deletion src/CodeGen_LLVM.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,10 @@ class CodeGen_LLVM : public IRVisitor {

llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv = false);

llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr);
llvm::Value *codegen_dense_vector_load(const Type &type, const std::string &name, const Expr &base,
const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
llvm::Value *vpred = nullptr, bool slice_to_native = true);
llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true);

virtual void codegen_predicated_vector_load(const Load *op);
virtual void codegen_predicated_vector_store(const Store *op);
Expand Down
4 changes: 2 additions & 2 deletions src/CodeGen_Posix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type
}

int CodeGen_Posix::allocation_padding(Type type) const {
// We potentially load one scalar value past the end of the
// We potentially load 3 scalar values past the end of the
// buffer, so pad the allocation with an extra instance of the
// scalar type.
return type.bytes();
return 3 * type.bytes();
}

CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type,
Expand Down
5 changes: 3 additions & 2 deletions test/correctness/nested_tail_strategies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@ void check(Func out, int line, std::vector<TailStrategy> tails) {
largest_allocation = 0;
out.realize({s});
size_t expected = (s + 1) * 4;
if (largest_allocation > expected) {
size_t tolerance = 3 * sizeof(int);
if (largest_allocation > expected + tolerance) {
std::cerr << "Failure on line " << line << "\n"
<< "with tail strategies: ";
for (auto t : tails) {
std::cerr << t << " ";
}
std::cerr << "\n allocation of " << largest_allocation
<< " bytes is too large. Expected " << expected << "\n";
<< " bytes is too large. Expected " << expected + tolerance << "\n";
abort();
}
}
Expand Down
Loading