Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/CodeGen_GPU_Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,14 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
i));
}

builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
int8_t buffer_type = 0;
if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) {
buffer_type = 2;
Copy link
Contributor Author

@jlaxson jlaxson Nov 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a little clumsy, but it's certainly effective for getting the texture flag to the runtime. Alternatives welcome. I haven't yet thought about what this would look like if any sampler-related info needs to be sent as well.

} else if (closure_args[i].is_buffer) {
buffer_type = 1;
}

builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
Expand Down
48 changes: 46 additions & 2 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
vector<llvm::Type *> arg_types(args.size());
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
arg_types[i] = llvm_type_of(Int(64));
} else {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
}
} else {
arg_types[i] = llvm_type_of(args[i].type);
}
Expand All @@ -83,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,

// Mark the buffer args as no alias
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
if (args[i].is_buffer && (args[i].write || args[i].memory_type != MemoryType::GPUTexture)) {
function->addParamAttr(i, Attribute::NoAlias);
}
}
Expand Down Expand Up @@ -172,6 +176,46 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
builder->CreateCall(barrier0);
value = ConstantInt::get(i32_t, 0);
} else if (op->is_intrinsic(Call::image_load)) {
int num_args = (op->args.size() - 2) / 2;
user_assert(num_args >= 1 && num_args <= 2);

string res_desc = "";
user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
llvm::Type *element_type;
if (op->type.is_float()) {
res_desc = "f32";
element_type = llvm_type_of(Float(32));
} else {
res_desc = "s32";
element_type = llvm_type_of(Int(32));
}
// PTX returns a 4 element struct (not a vector!) regardless of
llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);

string coord_desc = "";
Type coord_type = op->args[2].type();
internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
if (coord_type.is_float()) {
coord_desc = ".f32";
} else if (coord_type.is_uint()) {
coord_desc = ".u32";
} else if (coord_type.is_int()) {
coord_desc = ".s32";
}
internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;

string dim = std::to_string(num_args) + "d";
string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;

vector<Value *> coords;
coords.push_back(codegen(Variable::make(Int(64), op->args[0].as<StringImm>()->value)));
for (size_t i = 2; i < op->args.size(); i += 2) {
internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
coords.push_back(codegen(op->args[i]));
}
llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
value = builder->CreateExtractValue(call, {0});
} else {
CodeGen_LLVM::visit(op);
}
Expand Down
12 changes: 7 additions & 5 deletions src/Lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,13 @@ Module lower(const vector<Function> &output_funcs,
debug(2) << "Lowering after bounding small realizations:\n"
<< s << "\n\n";

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";
}

debug(1) << "Performing storage flattening...\n";
s = storage_flattening(s, outputs, env, t);
debug(2) << "Lowering after storage flattening:\n"
Expand All @@ -296,11 +303,6 @@ Module lower(const vector<Function> &output_funcs,
}

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";

debug(1) << "Injecting host <-> dev buffer copies...\n";
s = inject_host_dev_buffer_copies(s, t);
debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
Expand Down
62 changes: 59 additions & 3 deletions src/StorageFlattening.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,39 @@ using std::string;
using std::vector;

namespace {
class FindBuffersInGPU : public IRVisitor {
public:
map<string, set<DeviceAPI>> buffer_device_usage;

private:
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;
using IRVisitor::visit;

void visit(const Call *op) override {
debug(2) << " candidate load to " << op->name << " " << in_device_api << "\n";
if (in_gpu &&
(op->call_type == Call::Halide || op->call_type == Call::Image)) {
debug(2) << " load call to " << op->name << " " << in_device_api << "\n";
buffer_device_usage[op->name].insert(in_device_api);
}

IRVisitor::visit(op);
}

void visit(const For *op) override {
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
IRVisitor::visit(op);
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
}
};

class FlattenDimensions : public IRMutator {
public:
Expand All @@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
}
}

map<string, set<DeviceAPI>> buffer_apis;

private:
const map<string, pair<Function, int>> &env;
set<string> outputs;
Expand All @@ -42,6 +77,7 @@ class FlattenDimensions : public IRMutator {
Scope<> realizations, shader_scope_realizations;
bool in_shader = false;
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;

Expr make_shape_var(string name, const string &field, size_t dim,
const Buffer<> &buf, const Parameter &param) {
Expand Down Expand Up @@ -116,7 +152,7 @@ class FlattenDimensions : public IRMutator {

if (op->memory_type == MemoryType::GPUTexture) {
textures.insert(op->name);
debug(2) << "found texture " << op->name << "\n";
debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
}

Stmt body = mutate(op->body);
Expand Down Expand Up @@ -152,11 +188,23 @@ class FlattenDimensions : public IRMutator {
if (args[j] == storage_dims[i].var) {
storage_permutation.push_back((int)j);
Expr alignment = storage_dims[i].alignment;

if (alignment.defined()) {
allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
} else {
allocation_extents[j] = extents[j];
}

// Promote row alignment for buffers used as CUDA Textures
if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
// This could be symbolically fetched from runtime I guess?
int target_align_bytes = 32;
int target_align_items = target_align_bytes / op->types[0].bytes();

debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";

allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
}
}
}
internal_assert(storage_permutation.size() == i + 1);
Expand Down Expand Up @@ -259,7 +307,7 @@ class FlattenDimensions : public IRMutator {
Expr store = Call::make(value.type(), Call::image_store,
args, Call::Intrinsic);
return Evaluate::make(store);
} else if (in_gpu && textures.count(op->name)) {
} else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory
Expr buffer_var =
Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
vector<Expr> args(2);
Expand Down Expand Up @@ -398,6 +446,7 @@ class FlattenDimensions : public IRMutator {
Stmt visit(const For *op) override {
bool old_in_shader = in_shader;
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if ((op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) &&
op->device_api == DeviceAPI::GLSL) {
Expand All @@ -406,10 +455,12 @@ class FlattenDimensions : public IRMutator {
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
Stmt stmt = IRMutator::visit(op);
in_shader = old_in_shader;
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
return stmt;
}
};
Expand Down Expand Up @@ -483,7 +534,12 @@ Stmt storage_flattening(Stmt s,
}
}

s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
FindBuffersInGPU finder;
s.accept(&finder);
FlattenDimensions flatten(tuple_env, outputs, target);
flatten.buffer_apis = finder.buffer_device_usage;

s = flatten.mutate(s);
s = PromoteToMemoryType().mutate(s);
return s;
}
Expand Down
Loading