Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/CodeGen_GPU_Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,14 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
i));
}

builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
int8_t buffer_type = 0;
if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) {
buffer_type = 2;
Copy link
Contributor Author

@jlaxson jlaxson Nov 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a little clumsy, but it's certainly effective for getting the texture flag to the runtime. Alternatives welcome. I haven't yet thought about what this would look like if any sampler-related info needs to be sent as well.

} else if (closure_args[i].is_buffer) {
buffer_type = 1;
}

builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
Expand Down
48 changes: 46 additions & 2 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
vector<llvm::Type *> arg_types(args.size());
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
arg_types[i] = llvm_type_of(Int(64));
} else {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
}
} else {
arg_types[i] = llvm_type_of(args[i].type);
}
Expand All @@ -83,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,

// Mark the buffer args as no alias
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
if (args[i].is_buffer && (args[i].write || args[i].memory_type != MemoryType::GPUTexture)) {
function->addParamAttr(i, Attribute::NoAlias);
}
}
Expand Down Expand Up @@ -172,6 +176,46 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
builder->CreateCall(barrier0);
value = ConstantInt::get(i32_t, 0);
} else if (op->is_intrinsic(Call::image_load)) {
int num_args = (op->args.size() - 2) / 2;
user_assert(num_args >= 1 && num_args <= 2);

string res_desc = "";
user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
llvm::Type *element_type;
if (op->type.is_float()) {
res_desc = "f32";
element_type = llvm_type_of(Float(32));
} else {
res_desc = "s32";
element_type = llvm_type_of(Int(32));
}
// PTX returns a 4 element struct (not a vector!) regardless of
llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);

string coord_desc = "";
Type coord_type = op->args[2].type();
internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
if (coord_type.is_float()) {
coord_desc = ".f32";
} else if (coord_type.is_uint()) {
coord_desc = ".u32";
} else if (coord_type.is_int()) {
coord_desc = ".s32";
}
internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;

string dim = std::to_string(num_args) + "d";
string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;

vector<Value *> coords;
coords.push_back(codegen(Variable::make(Int(64), op->args[0].as<StringImm>()->value)));
for (size_t i = 2; i < op->args.size(); i += 2) {
internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
coords.push_back(codegen(op->args[i]));
}
llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
value = builder->CreateExtractValue(call, {0});
} else {
CodeGen_LLVM::visit(op);
}
Expand Down
12 changes: 7 additions & 5 deletions src/Lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,13 @@ Module lower(const vector<Function> &output_funcs,
debug(2) << "Lowering after bounding small realizations:\n"
<< s << "\n\n";

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";
}

debug(1) << "Performing storage flattening...\n";
s = storage_flattening(s, outputs, env, t);
debug(2) << "Lowering after storage flattening:\n"
Expand All @@ -296,11 +303,6 @@ Module lower(const vector<Function> &output_funcs,
}

if (will_inject_host_copies) {
debug(1) << "Selecting a GPU API for GPU loops...\n";
s = select_gpu_api(s, t);
debug(2) << "Lowering after selecting a GPU API:\n"
<< s << "\n\n";

debug(1) << "Injecting host <-> dev buffer copies...\n";
s = inject_host_dev_buffer_copies(s, t);
debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
Expand Down
62 changes: 59 additions & 3 deletions src/StorageFlattening.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,39 @@ using std::string;
using std::vector;

namespace {
class FindBuffersInGPU : public IRVisitor {
public:
map<string, set<DeviceAPI>> buffer_device_usage;

private:
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;
using IRVisitor::visit;

void visit(const Call *op) override {
debug(2) << " candidate load to " << op->name << " " << in_device_api << "\n";
if (in_gpu &&
(op->call_type == Call::Halide || op->call_type == Call::Image)) {
debug(2) << " load call to " << op->name << " " << in_device_api << "\n";
buffer_device_usage[op->name].insert(in_device_api);
}

IRVisitor::visit(op);
}

void visit(const For *op) override {
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
IRVisitor::visit(op);
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
}
};

class FlattenDimensions : public IRMutator {
public:
Expand All @@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
}
}

map<string, set<DeviceAPI>> buffer_apis;

private:
const map<string, pair<Function, int>> &env;
set<string> outputs;
Expand All @@ -42,6 +77,7 @@ class FlattenDimensions : public IRMutator {
Scope<> realizations, shader_scope_realizations;
bool in_shader = false;
bool in_gpu = false;
DeviceAPI in_device_api = DeviceAPI::None;

Expr make_shape_var(string name, const string &field, size_t dim,
const Buffer<> &buf, const Parameter &param) {
Expand Down Expand Up @@ -116,7 +152,7 @@ class FlattenDimensions : public IRMutator {

if (op->memory_type == MemoryType::GPUTexture) {
textures.insert(op->name);
debug(2) << "found texture " << op->name << "\n";
debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
}

Stmt body = mutate(op->body);
Expand Down Expand Up @@ -152,11 +188,23 @@ class FlattenDimensions : public IRMutator {
if (args[j] == storage_dims[i].var) {
storage_permutation.push_back((int)j);
Expr alignment = storage_dims[i].alignment;

if (alignment.defined()) {
allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
} else {
allocation_extents[j] = extents[j];
}

// Promote row alignment for buffers used as CUDA Textures
if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
// This could be symbolically fetched from runtime I guess?
int target_align_bytes = 32;
int target_align_items = target_align_bytes / op->types[0].bytes();

debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";

allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
}
}
}
internal_assert(storage_permutation.size() == i + 1);
Expand Down Expand Up @@ -259,7 +307,7 @@ class FlattenDimensions : public IRMutator {
Expr store = Call::make(value.type(), Call::image_store,
args, Call::Intrinsic);
return Evaluate::make(store);
} else if (in_gpu && textures.count(op->name)) {
} else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory
Expr buffer_var =
Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
vector<Expr> args(2);
Expand Down Expand Up @@ -398,6 +446,7 @@ class FlattenDimensions : public IRMutator {
Stmt visit(const For *op) override {
bool old_in_shader = in_shader;
bool old_in_gpu = in_gpu;
DeviceAPI old_in_device_api = in_device_api;
if ((op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) &&
op->device_api == DeviceAPI::GLSL) {
Expand All @@ -406,10 +455,12 @@ class FlattenDimensions : public IRMutator {
if (op->for_type == ForType::GPUBlock ||
op->for_type == ForType::GPUThread) {
in_gpu = true;
in_device_api = op->device_api;
}
Stmt stmt = IRMutator::visit(op);
in_shader = old_in_shader;
in_gpu = old_in_gpu;
in_device_api = old_in_device_api;
return stmt;
}
};
Expand Down Expand Up @@ -483,7 +534,12 @@ Stmt storage_flattening(Stmt s,
}
}

s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
FindBuffersInGPU finder;
s.accept(&finder);
FlattenDimensions flatten(tuple_env, outputs, target);
flatten.buffer_apis = finder.buffer_device_usage;

s = flatten.mutate(s);
s = PromoteToMemoryType().mutate(s);
return s;
}
Expand Down
Loading