Skip to content
Closed
9 changes: 8 additions & 1 deletion src/CodeGen_GPU_Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,14 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
i));
}

builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
int8_t buffer_type = 0;
if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) {
buffer_type = 2;
} else if (closure_args[i].is_buffer) {
buffer_type = 1;
}

builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
Expand Down
48 changes: 46 additions & 2 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
vector<llvm::Type *> arg_types(args.size());
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
arg_types[i] = llvm_type_of(Int(64));
} else {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
}
} else {
arg_types[i] = llvm_type_of(args[i].type);
}
Expand All @@ -83,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,

// Mark the buffer args as no alias
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
if (args[i].is_buffer && (args[i].write || args[i].memory_type != MemoryType::GPUTexture)) {
function->addParamAttr(i, Attribute::NoAlias);
}
}
Expand Down Expand Up @@ -172,6 +176,46 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
builder->CreateCall(barrier0);
value = ConstantInt::get(i32_t, 0);
} else if (op->is_intrinsic(Call::image_load)) {
int num_args = (op->args.size() - 2) / 2;
user_assert(num_args >= 1 && num_args <= 2);

string res_desc = "";
user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
llvm::Type *element_type;
if (op->type.is_float()) {
res_desc = "f32";
element_type = llvm_type_of(Float(32));
} else {
res_desc = "s32";
element_type = llvm_type_of(Int(32));
}
// PTX returns a 4 element struct (not a vector!) regardless of the vector width requested
llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);

string coord_desc = "";
Type coord_type = op->args[2].type();
internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
if (coord_type.is_float()) {
coord_desc = ".f32";
} else if (coord_type.is_uint()) {
coord_desc = ".u32";
} else if (coord_type.is_int()) {
coord_desc = ".s32";
}
internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;

string dim = std::to_string(num_args) + "d";
string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;

vector<Value *> coords;
coords.push_back(codegen(Variable::make(Int(64), op->args[0].as<StringImm>()->value)));
for (size_t i = 2; i < op->args.size(); i += 2) {
internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
coords.push_back(codegen(op->args[i]));
}
llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
value = builder->CreateExtractValue(call, {0});
} else {
CodeGen_LLVM::visit(op);
}
Expand Down
12 changes: 7 additions & 5 deletions src/Lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,13 @@ Module lower(const vector<Function> &output_funcs,
debug(2) << "Lowering after bounding small realizations:\n"
<< s << "\n\n";

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";
}

debug(1) << "Performing storage flattening...\n";
s = storage_flattening(s, outputs, env, t);
debug(2) << "Lowering after storage flattening:\n"
Expand All @@ -296,11 +303,6 @@ Module lower(const vector<Function> &output_funcs,
}

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";

debug(1) << "Injecting host <-> dev buffer copies...\n";
s = inject_host_dev_buffer_copies(s, t);
debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
Expand Down
62 changes: 59 additions & 3 deletions src/StorageFlattening.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,39 @@ using std::string;
using std::vector;

namespace {
class FindBuffersInGPU : public IRVisitor {
public:
map<string, set<DeviceAPI>> buffer_device_usage;

private:
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;
using IRVisitor::visit;

void visit(const Call *op) override {
debug(2) << " candidate load to " << op->name << " " << in_device_api << "\n";
if (in_gpu &&
(op->call_type == Call::Halide || op->call_type == Call::Image)) {
debug(2) << " load call to " << op->name << " " << in_device_api << "\n";
buffer_device_usage[op->name].insert(in_device_api);
}

IRVisitor::visit(op);
}

void visit(const For *op) override {
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
IRVisitor::visit(op);
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
}
};

class FlattenDimensions : public IRMutator {
public:
Expand All @@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
}
}

map<string, set<DeviceAPI>> buffer_apis;

private:
const map<string, pair<Function, int>> &env;
set<string> outputs;
Expand All @@ -42,6 +77,7 @@ class FlattenDimensions : public IRMutator {
Scope<> realizations, shader_scope_realizations;
bool in_shader = false;
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;

Expr make_shape_var(string name, const string &field, size_t dim,
const Buffer<> &buf, const Parameter &param) {
Expand Down Expand Up @@ -116,7 +152,7 @@ class FlattenDimensions : public IRMutator {

if (op->memory_type == MemoryType::GPUTexture) {
textures.insert(op->name);
debug(2) << "found texture " << op->name << "\n";
debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
}

Stmt body = mutate(op->body);
Expand Down Expand Up @@ -152,11 +188,23 @@ class FlattenDimensions : public IRMutator {
if (args[j] == storage_dims[i].var) {
storage_permutation.push_back((int)j);
Expr alignment = storage_dims[i].alignment;

if (alignment.defined()) {
allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
} else {
allocation_extents[j] = extents[j];
}

// Promote row alignment for buffers used as CUDA Textures
if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
// This could be symbolically fetched from runtime I guess?
int target_align_bytes = 32;
int target_align_items = target_align_bytes / op->types[0].bytes();

debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";

allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
}
}
}
internal_assert(storage_permutation.size() == i + 1);
Expand Down Expand Up @@ -259,7 +307,7 @@ class FlattenDimensions : public IRMutator {
Expr store = Call::make(value.type(), Call::image_store,
args, Call::Intrinsic);
return Evaluate::make(store);
} else if (in_gpu && textures.count(op->name)) {
} else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory
Expr buffer_var =
Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
vector<Expr> args(2);
Expand Down Expand Up @@ -398,6 +446,7 @@ class FlattenDimensions : public IRMutator {
Stmt visit(const For *op) override {
bool old_in_shader = in_shader;
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if ((op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) &&
op->device_api == DeviceAPI::GLSL) {
Expand All @@ -406,10 +455,12 @@ class FlattenDimensions : public IRMutator {
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
Stmt stmt = IRMutator::visit(op);
in_shader = old_in_shader;
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
return stmt;
}
};
Expand Down Expand Up @@ -483,7 +534,12 @@ Stmt storage_flattening(Stmt s,
}
}

s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
FindBuffersInGPU finder;
s.accept(&finder);
FlattenDimensions flatten(tuple_env, outputs, target);
flatten.buffer_apis = finder.buffer_device_usage;

s = flatten.mutate(s);
s = PromoteToMemoryType().mutate(s);
return s;
}
Expand Down
Loading