From 7940fa5c304b5051d1b38757777f805cacb904ee Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Sun, 1 Nov 2020 21:37:28 -0700
Subject: [PATCH 01/13] Add initial CUDA texture support for GPUTexture buffers

---
 src/CodeGen_GPU_Host.cpp         |   9 ++-
 src/CodeGen_LLVM.cpp             |   2 +-
 src/CodeGen_PTX_Dev.cpp          |  43 +++++++++-
 src/StorageFlattening.cpp        |   6 +-
 src/runtime/cuda.cpp             |  60 +++++++++++++-
 src/runtime/cuda_functions.h     |   3 +
 src/runtime/mini_cuda.h          | 130 +++++++++++++++++++++++++++++++
 test/correctness/gpu_texture.cpp |  20 ++---
 8 files changed, 258 insertions(+), 15 deletions(-)
diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index c27ab4d0e788..3262a57e8491 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -441,7 +441,14 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
                                          i));
             }
 
-            builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
+            int8_t buffer_type = 0;
+            if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) {
+                buffer_type = 2;
+            } else if (closure_args[i].is_buffer) {
+                buffer_type = 1;
+            }
+
+            builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
                                  builder->CreateConstGEP2_32(
                                      gpu_arg_is_buffer_arr_type,
                                      gpu_arg_is_buffer_arr,
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index e7b4d7954fbd..5b9051e18706 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1435,7 +1435,7 @@ Value *CodeGen_LLVM::codegen(const Expr &e) {
                     value->getType() == llvm_type_of(e.type()))
         << "Codegen of Expr " << e
         << " of type " << e.type()
-        << " did not produce llvm IR of the corresponding llvm type.\n";
+        << " did not produce llvm IR of the corresponding llvm type.  Llvm was " << llvm_type_of(e.type()) << "\n";
     return value;
 }
 
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index ea876762b3ee..79ff9f8763ae 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     vector<llvm::Type *> arg_types(args.size());
     for (size_t i = 0; i < args.size(); i++) {
         if (args[i].is_buffer) {
-            arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            if (args[i].memory_type == MemoryType::GPUTexture) {
+                arg_types[i] = llvm_type_of(Int(64));
+            } else {
+                arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            }
         } else {
             arg_types[i] = llvm_type_of(args[i].type);
         }
@@ -172,6 +176,43 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
         builder->CreateCall(barrier0);
         value = ConstantInt::get(i32_t, 0);
+    } else if (op->is_intrinsic(Call::image_load)) {
+        int num_args = (op->args.size() - 2) / 2;
+        user_assert(num_args >= 1 && num_args <= 2);
+
+        string res_desc = "";
+        user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
+        Type res_type;
+        if (op->type.is_float()) {
+            res_desc = "f32";
+            res_type = Type(Type::Float, 32, 4);
+        } else {
+            res_desc = "s32";
+            res_type = Type(Type::Int, 32, 4);
+        }
+
+        string coord_desc = "";
+        if (op->args[2].type().is_float()) {
+            coord_desc = ".f32";
+        } else {
+            coord_desc = ".s32";
+        }
+
+        string dim = std::to_string(num_args) + "d";
+        string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;
+
+        vector<Expr> coords;
+        coords.push_back(Variable::make(Int(64), op->args[0].as<StringImm>()->value));
+        for (size_t i = 2; i < op->args.size(); i += 2) {
+            internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
+            coords.push_back(op->args[i]);
+        }
+        llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords);
+        // call->getCalledFunction()->setCallingConv(CallingConv::Tail);
+        // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords);
+        // call->setTailCall(true);
+        value = builder->CreateExtractElement(call, ConstantInt::get(i32_t, 0));
+
     } else {
         CodeGen_LLVM::visit(op);
     }
diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp
index e3ad51038666..aab178b4bc05 100644
--- a/src/StorageFlattening.cpp
+++ b/src/StorageFlattening.cpp
@@ -42,6 +42,7 @@ class FlattenDimensions : public IRMutator {
     Scope<> realizations, shader_scope_realizations;
     bool in_shader = false;
     bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
 
     Expr make_shape_var(string name, const string &field, size_t dim,
                         const Buffer<> &buf, const Parameter &param) {
@@ -259,7 +260,7 @@ class FlattenDimensions : public IRMutator {
             Expr store = Call::make(value.type(), Call::image_store,
                                     args, Call::Intrinsic);
             return Evaluate::make(store);
-        } else if (in_gpu && textures.count(op->name)) {
+        } else if (in_gpu && textures.count(op->name) && false && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory
             Expr buffer_var =
                 Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
             vector<Expr> args(2);
@@ -398,6 +399,7 @@ class FlattenDimensions : public IRMutator {
     Stmt visit(const For *op) override {
         bool old_in_shader = in_shader;
         bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
         if ((op->for_type == ForType::GPUBlock ||
              op->for_type == ForType::GPUThread) &&
             op->device_api == DeviceAPI::GLSL) {
@@ -406,10 +408,12 @@ class FlattenDimensions : public IRMutator {
         if (op->for_type == ForType::GPUBlock ||
             op->for_type == ForType::GPUThread) {
             in_gpu = true;
+            in_device_api = op->device_api;
         }
         Stmt stmt = IRMutator::visit(op);
         in_shader = old_in_shader;
         in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
         return stmt;
     }
 };
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7c423e179d85..10f14595b86a 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1163,8 +1163,24 @@ WEAK int halide_cuda_run(void *user_context,
     for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
         if (arg_is_buffer[i]) {
             halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
-            dev_handles[i] = ((halide_buffer_t *)args[i])->device;
-            translated_args[i] = &(dev_handles[i]);
+            if (arg_is_buffer[i] == 2) {
+                cudaResourceDesc rdesc;
+                cudaTextureDesc tdesc;
+                cudaResourceViewDesc rviewdesc;
+                cudaTextureObject_t *texture = (cudaTextureObject_t *)&dev_handles[i];
+                err = cudaCreateTextureObject(texture, &rdesc, &tdesc, &rviewdesc);
+                if (err != CUDA_SUCCESS) {
+                    error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed: "
+                                        << get_error_name(err);
+                    free(dev_handles);
+                    free(translated_args);
+                    return err;
+                }
+                translated_args[i] = (void *)*texture;
+            } else {
+                dev_handles[i] = ((halide_buffer_t *)args[i])->device;
+                translated_args[i] = &(dev_handles[i]);
+            }
             debug(user_context) << "    halide_cuda_run translated arg" << (int)i
                                 << " [" << (*((void **)translated_args[i])) << " ...]\n";
         } else {
@@ -1192,6 +1208,14 @@ WEAK int halide_cuda_run(void *user_context,
                          stream,
                          translated_args,
                          nullptr);
+
+    for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
+        if (arg_is_buffer[i] == 2) {
+            cudaTextureObject_t texture = (cudaTextureObject_t)translated_args[i];
+            cudaDestroyTextureObject(texture);
+        }
+    }
+
     free(dev_handles);
     free(translated_args);
     if (err != CUDA_SUCCESS) {
@@ -1311,6 +1335,38 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min
     return 0;
 }
 
+WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
+    if (!cudaCreateTextureObject) {
+        debug(user_context) << "requesting texture object but don't have runtime functions";
+        return -1;
+    }
+
+    struct cudaResourceDesc resourceDesc;
+    struct cudaTextureDesc textureDesc;
+    struct cudaResourceViewDesc resourceViewDesc;
+
+    cudaTextureObject_t texture;
+    CUresult err = cudaCreateTextureObject(&texture, &resourceDesc, &textureDesc, &resourceViewDesc);
+
+    if (err != CUDA_SUCCESS) {
+        error(user_context)
+            << "CUDA: cudaCreateTextureObject failed ("
+            << Halide::Runtime::Internal::Cuda::get_error_name(err)
+            << ")";
+        return 0;
+    }
+
+    return texture;
+}
+
+WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) {
+    if (!cudaDestroyTextureObject && texture_object) {
+        error(user_context) << "attempting to free texture object but don't have runtime functions";
+    }
+
+    return cudaDestroyTextureObject(texture_object);
+}
+
 namespace {
 WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
     halide_cuda_device_release(nullptr);
diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h
index 2f311bfd603e..b0d32755f707 100644
--- a/src/runtime/cuda_functions.h
+++ b/src/runtime/cuda_functions.h
@@ -47,6 +47,9 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p
 
 CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream));
 
+CUDA_FN_OPTIONAL(CUresult,  cudaCreateTextureObject, (cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc));
+CUDA_FN_OPTIONAL(CUresult, cudaDestroyTextureObject, (cudaTextureObject_t texObject));
+
 #undef CUDA_FN
 #undef CUDA_FN_OPTIONAL
 #undef CUDA_FN_3020
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index cfe21d70617a..0598a146c944 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -229,6 +229,136 @@ typedef struct CUDA_MEMCPY3D_st {
     size_t Depth;        /**< Depth of 3D memory copy */
 } CUDA_MEMCPY3D;
 
+typedef unsigned long long cudaTextureObject_t;
+
+enum cudaChannelFormatKind {
+    cudaChannelFormatKindSigned = 0,   /**< Signed channel format */
+    cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */
+    cudaChannelFormatKindFloat = 2,    /**< Float channel format */
+    cudaChannelFormatKindNone = 3      /**< No channel format */
+};
+
+enum cudaResourceType {
+    cudaResourceTypeArray = 0x00,
+    cudaResourceTypeMipmappedArray = 0x01,
+    cudaResourceTypeLinear = 0x02,
+    cudaResourceTypePitch2D = 0x03
+};
+
+struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+};
+
+enum cudaTextureAddressMode {
+    cudaAddressModeWrap = 0,
+    cudaAddressModeClamp = 1,
+    cudaAddressModeMirror = 2,
+    cudaAddressModeBorder = 3
+};
+
+enum cudaTextureFilterMode {
+    cudaFilterModePoint = 0,
+    cudaFilterModeLinear = 1
+};
+
+enum cudaTextureReadMode {
+    cudaReadModeElementType = 0,
+    cudaReadModeNormalizedFloat = 1
+};
+
+/**
+ * CUDA texture resource view formats
+ */
+enum cudaResourceViewFormat
+{
+    cudaResViewFormatNone                      = 0x00, /**< No resource view format (use underlying resource format) */
+    cudaResViewFormatUnsignedChar1             = 0x01, /**< 1 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar2             = 0x02, /**< 2 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar4             = 0x03, /**< 4 channel unsigned 8-bit integers */
+    cudaResViewFormatSignedChar1               = 0x04, /**< 1 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar2               = 0x05, /**< 2 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar4               = 0x06, /**< 4 channel signed 8-bit integers */
+    cudaResViewFormatUnsignedShort1            = 0x07, /**< 1 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort2            = 0x08, /**< 2 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort4            = 0x09, /**< 4 channel unsigned 16-bit integers */
+    cudaResViewFormatSignedShort1              = 0x0a, /**< 1 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort2              = 0x0b, /**< 2 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort4              = 0x0c, /**< 4 channel signed 16-bit integers */
+    cudaResViewFormatUnsignedInt1              = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt2              = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt4              = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    cudaResViewFormatSignedInt1                = 0x10, /**< 1 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt2                = 0x11, /**< 2 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt4                = 0x12, /**< 4 channel signed 32-bit integers */
+    cudaResViewFormatHalf1                     = 0x13, /**< 1 channel 16-bit floating point */
+    cudaResViewFormatHalf2                     = 0x14, /**< 2 channel 16-bit floating point */
+    cudaResViewFormatHalf4                     = 0x15, /**< 4 channel 16-bit floating point */
+    cudaResViewFormatFloat1                    = 0x16, /**< 1 channel 32-bit floating point */
+    cudaResViewFormatFloat2                    = 0x17, /**< 2 channel 32-bit floating point */
+    cudaResViewFormatFloat4                    = 0x18, /**< 4 channel 32-bit floating point */
+    cudaResViewFormatUnsignedBlockCompressed1  = 0x19, /**< Block compressed 1 */
+    cudaResViewFormatUnsignedBlockCompressed2  = 0x1a, /**< Block compressed 2 */
+    cudaResViewFormatUnsignedBlockCompressed3  = 0x1b, /**< Block compressed 3 */
+    cudaResViewFormatUnsignedBlockCompressed4  = 0x1c, /**< Block compressed 4 unsigned */
+    cudaResViewFormatSignedBlockCompressed4    = 0x1d, /**< Block compressed 4 signed */
+    cudaResViewFormatUnsignedBlockCompressed5  = 0x1e, /**< Block compressed 5 unsigned */
+    cudaResViewFormatSignedBlockCompressed5    = 0x1f, /**< Block compressed 5 signed */
+    cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    cudaResViewFormatSignedBlockCompressed6H   = 0x21, /**< Block compressed 6 signed half-float */
+    cudaResViewFormatUnsignedBlockCompressed7  = 0x22  /**< Block compressed 7 */
+};
+
+struct cudaResourceViewDesc {
+    enum cudaResourceViewFormat format;
+    size_t width;
+    size_t height;
+    size_t depth;
+    unsigned int firstMipmapLevel;
+    unsigned int lastMipmapLevel;
+    unsigned int firstLayer;
+    unsigned int lastLayer;
+};
+
+struct cudaTextureDesc {
+    enum cudaTextureAddressMode addressMode[3];
+    enum cudaTextureFilterMode filterMode;
+    enum cudaTextureReadMode readMode;
+    int sRGB;
+    float borderColor[4];
+    int normalizedCoords;
+    unsigned int maxAnisotropy;
+    enum cudaTextureFilterMode mipmapFilterMode;
+    float mipmapLevelBias;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+};
+
+struct cudaResourceDesc {
+    enum cudaResourceType resType;
+
+    union {
+        struct {
+            // cudaArray_t array;
+        } array;
+        struct {
+            // cudaMipmappedArray_t mipmap;
+        } mipmap;
+        struct {
+            void *devPtr;
+            struct cudaChannelFormatDesc desc;
+            size_t sizeInBytes;
+        } linear;
+        struct {
+            void *devPtr;
+            struct cudaChannelFormatDesc desc;
+            size_t width;
+            size_t height;
+            size_t pitchInBytes;
+        } pitch2D;
+    } res;
+};
+
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
 
 }  // namespace Cuda
diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp
index 62ae5feb77a2..3de269d07fa2 100644
--- a/test/correctness/gpu_texture.cpp
+++ b/test/correctness/gpu_texture.cpp
@@ -8,18 +8,20 @@ using namespace Halide::Internal;
 int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
 
-    if (!t.has_feature(halide_target_feature_opencl)) {
-        printf("[SKIP] No OpenCL target enabled.\n");
+    if (!(t.has_feature(halide_target_feature_opencl) || t.has_feature(halide_target_feature_cuda_capability30))) {
+        printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n");
         return 0;
     }
 
-    const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
-    assert(interface->compute_capability != nullptr);
-    int major, minor;
-    int err = interface->compute_capability(nullptr, &major, &minor);
-    if (err != 0 || (major == 1 && minor < 2)) {
-        printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
-        return 0;
+    if (t.has_feature(halide_target_feature_opencl)) {
+        const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
+        assert(interface->compute_capability != nullptr);
+        int major, minor;
+        int err = interface->compute_capability(nullptr, &major, &minor);
+        if (err != 0 || (major == 1 && minor < 2)) {
+            printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
+            return 0;
+        }
     }
 
     // Check dynamic allocations into Heap and Texture memory

From 9a2239a1eb3b325cffb35d252f08bc67e1dacddf Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 00:39:00 -0700
Subject: [PATCH 02/13] PTX codegen: texture intrinsic returns a 4-element struct, not a vector

---
 src/CodeGen_PTX_Dev.cpp | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 79ff9f8763ae..40160fa6ce32 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -70,7 +70,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     vector<llvm::Type *> arg_types(args.size());
     for (size_t i = 0; i < args.size(); i++) {
         if (args[i].is_buffer) {
-            if (args[i].memory_type == MemoryType::GPUTexture) {
+            if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
                 arg_types[i] = llvm_type_of(Int(64));
             } else {
                 arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
@@ -87,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
 
     // Mark the buffer args as no alias
     for (size_t i = 0; i < args.size(); i++) {
-        if (args[i].is_buffer) {
+        if (args[i].is_buffer && (args[i].write || args[i].memory_type != MemoryType::GPUTexture)) {
             function->addParamAttr(i, Attribute::NoAlias);
         }
     }
@@ -182,16 +182,19 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
 
         string res_desc = "";
         user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
-        Type res_type;
+        llvm::Type *res_type;
         if (op->type.is_float()) {
             res_desc = "f32";
-            res_type = Type(Type::Float, 32, 4);
+            auto element = llvm_type_of(Float(32));
+            res_type = llvm::StructType::get(element, element, element, element);
         } else {
             res_desc = "s32";
-            res_type = Type(Type::Int, 32, 4);
+            auto element = llvm_type_of(Int(32));
+            res_type = llvm::StructType::get(element, element, element, element);
         }
 
         string coord_desc = "";
+        user_assert(op->args[2].type().bits() == 32) << "ptx texture sampler only supports 32 bit args";
         if (op->args[2].type().is_float()) {
             coord_desc = ".f32";
         } else {
@@ -201,17 +204,17 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         string dim = std::to_string(num_args) + "d";
         string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;
 
-        vector<Expr> coords;
-        coords.push_back(Variable::make(Int(64), op->args[0].as<StringImm>()->value));
+        vector<Value *> coords;
+        coords.push_back(codegen(Variable::make(Int(64), op->args[0].as<StringImm>()->value)));
         for (size_t i = 2; i < op->args.size(); i += 2) {
             internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
-            coords.push_back(op->args[i]);
+            coords.push_back(codegen(op->args[i]));
         }
-        llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords);
+        llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
         // call->getCalledFunction()->setCallingConv(CallingConv::Tail);
         // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords);
         // call->setTailCall(true);
-        value = builder->CreateExtractElement(call, ConstantInt::get(i32_t, 0));
+        value = builder->CreateExtractValue(call, {0});
 
     } else {
         CodeGen_LLVM::visit(op);

From 63288d497399bc9809ad0e781446c82fa663d016 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 09:11:21 -0700
Subject: [PATCH 03/13] Use CUDA driver API (cuTexObjectCreate) for texture objects; it kinda works

---
 src/runtime/cuda.cpp         | 149 ++++++++++++++++-------
 src/runtime/cuda_functions.h |   4 +-
 src/runtime/mini_cuda.h      | 227 ++++++++++++++++++++---------------
 3 files changed, 234 insertions(+), 146 deletions(-)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 10f14595b86a..1db647d899e6 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1099,6 +1099,102 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
     return 0;
 }
 
+namespace {
+WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
+    debug(user_context)
+        << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n";
+
+    halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device);
+
+    if (!cuTexObjectCreate) {
+        error(user_context) << "requesting texture object but don't have runtime functions";
+        return 0;
+    }
+
+    CUDA_RESOURCE_DESC resourceDesc;
+    CUDA_TEXTURE_DESC textureDesc;
+    // CUDA_RESOURCE_VIEW_DESC resourceViewDesc;
+
+    memset(&resourceDesc, 0, sizeof(resourceDesc));
+    memset(&textureDesc, 0, sizeof(textureDesc));
+
+    // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT;
+
+    CUarray_format format = (CUarray_format)0;
+    struct halide_type_t type = buf->type;
+    if (type.code == halide_type_int) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_SIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_SIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_SIGNED_INT32;
+        }
+    } else if (type.code == halide_type_uint) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_UNSIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_UNSIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_UNSIGNED_INT32;
+        }
+    } else if (type.code == halide_type_float) {
+        if (type.bits == 16) {
+            format = CU_AD_FORMAT_HALF;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_FLOAT;
+        }
+    }
+    if (format == 0) {
+        error(user_context) << "Unhandled datatype for CUDA texture object: " << type;
+        return 0;
+    }
+
+    resourceDesc.flags = 0;
+    if (buf->dimensions == 1) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+        resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device; 
+        resourceDesc.res.linear.format = format;
+        resourceDesc.res.linear.numChannels = 1;
+        resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes();
+    } else if (buf->dimensions == 2) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+        resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device;
+        resourceDesc.res.pitch2D.format = format;
+        resourceDesc.res.pitch2D.numChannels = 1;
+        resourceDesc.res.pitch2D.width = buf->dim[0].extent;
+        resourceDesc.res.pitch2D.height = buf->dim[1].extent;
+        resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride;
+    } else {
+        error(user_context) << "cuda texture support only handles 1d and td textures";
+        return 0;
+    }
+
+    CUtexObject texture = 0;
+    CUresult err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr);
+
+    if (err != CUDA_SUCCESS) {
+        error(user_context)
+            << "CUDA: cuTexObjectCreate failed ("
+            << Halide::Runtime::Internal::Cuda::get_error_name(err)
+            << ")";
+        return 0;
+    }
+
+    debug(user_context) << "    got texture " << texture << "\n";
+
+    return texture;
+}
+
+WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) {
+    if (!cuTexObjectDestroy && texture_object) {
+        error(user_context) << "attempting to free texture object but don't have runtime functions";
+    }
+
+    return cuTexObjectDestroy(texture_object);
+}
+}  // namespace
+
 WEAK int halide_cuda_run(void *user_context,
                          void *state_ptr,
                          const char *entry_name,
@@ -1164,19 +1260,16 @@ WEAK int halide_cuda_run(void *user_context,
         if (arg_is_buffer[i]) {
             halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
             if (arg_is_buffer[i] == 2) {
-                cudaResourceDesc rdesc;
-                cudaTextureDesc tdesc;
-                cudaResourceViewDesc rviewdesc;
-                cudaTextureObject_t *texture = (cudaTextureObject_t *)&dev_handles[i];
-                err = cudaCreateTextureObject(texture, &rdesc, &tdesc, &rviewdesc);
-                if (err != CUDA_SUCCESS) {
-                    error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed: "
-                                        << get_error_name(err);
+                CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true);
+
+                if (!texture) {
+                    error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed";
                     free(dev_handles);
                     free(translated_args);
-                    return err;
+                    return -1;
                 }
-                translated_args[i] = (void *)*texture;
+                dev_handles[i] = texture;
+                translated_args[i] = &(dev_handles[i]);
             } else {
                 dev_handles[i] = ((halide_buffer_t *)args[i])->device;
                 translated_args[i] = &(dev_handles[i]);
@@ -1211,8 +1304,8 @@ WEAK int halide_cuda_run(void *user_context,
 
     for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
         if (arg_is_buffer[i] == 2) {
-            cudaTextureObject_t texture = (cudaTextureObject_t)translated_args[i];
-            cudaDestroyTextureObject(texture);
+            CUtexObject texture = (CUtexObject)dev_handles[i];
+            halide_cuda_free_texture(user_context, (halide_buffer_t *)args[i], texture);
         }
     }
 
@@ -1335,38 +1428,6 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min
     return 0;
 }
 
-WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
-    if (!cudaCreateTextureObject) {
-        debug(user_context) << "requesting texture object but don't have runtime functions";
-        return -1;
-    }
-
-    struct cudaResourceDesc resourceDesc;
-    struct cudaTextureDesc textureDesc;
-    struct cudaResourceViewDesc resourceViewDesc;
-
-    cudaTextureObject_t texture;
-    CUresult err = cudaCreateTextureObject(&texture, &resourceDesc, &textureDesc, &resourceViewDesc);
-
-    if (err != CUDA_SUCCESS) {
-        error(user_context)
-            << "CUDA: cudaCreateTextureObject failed ("
-            << Halide::Runtime::Internal::Cuda::get_error_name(err)
-            << ")";
-        return 0;
-    }
-
-    return texture;
-}
-
-WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) {
-    if (!cudaDestroyTextureObject && texture_object) {
-        error(user_context) << "attempting to free texture object but don't have runtime functions";
-    }
-
-    return cudaDestroyTextureObject(texture_object);
-}
-
 namespace {
 WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
     halide_cuda_device_release(nullptr);
diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h
index b0d32755f707..ba6f352ebb0e 100644
--- a/src/runtime/cuda_functions.h
+++ b/src/runtime/cuda_functions.h
@@ -47,8 +47,8 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p
 
 CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream));
 
-CUDA_FN_OPTIONAL(CUresult,  cudaCreateTextureObject, (cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc));
-CUDA_FN_OPTIONAL(CUresult, cudaDestroyTextureObject, (cudaTextureObject_t texObject));
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc));
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject));
 
 #undef CUDA_FN
 #undef CUDA_FN_OPTIONAL
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index 0598a146c944..a2be5f0aa4ce 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -229,38 +229,50 @@ typedef struct CUDA_MEMCPY3D_st {
     size_t Depth;        /**< Depth of 3D memory copy */
 } CUDA_MEMCPY3D;
 
-typedef unsigned long long cudaTextureObject_t;
+typedef unsigned long long CUtexObject;
 
-enum cudaChannelFormatKind {
-    cudaChannelFormatKindSigned = 0,   /**< Signed channel format */
-    cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */
-    cudaChannelFormatKindFloat = 2,    /**< Float channel format */
-    cudaChannelFormatKindNone = 3      /**< No channel format */
-};
+/**
+ * Array formats
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
+} CUarray_format;
 
-enum cudaResourceType {
-    cudaResourceTypeArray = 0x00,
-    cudaResourceTypeMipmappedArray = 0x01,
-    cudaResourceTypeLinear = 0x02,
-    cudaResourceTypePitch2D = 0x03
-};
+/**
+ * Resource types
+ */
+typedef enum CUresourcetype_enum {
+    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+} CUresourcetype;
 
-struct cudaChannelFormatDesc {
-    int x, y, z, w;
-    enum cudaChannelFormatKind f;
-};
+/**
+ * Texture reference addressing modes
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
 
-enum cudaTextureAddressMode {
-    cudaAddressModeWrap = 0,
-    cudaAddressModeClamp = 1,
-    cudaAddressModeMirror = 2,
-    cudaAddressModeBorder = 3
-};
+/**
+ * Texture reference filtering modes
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
 
-enum cudaTextureFilterMode {
-    cudaFilterModePoint = 0,
-    cudaFilterModeLinear = 1
-};
 
 enum cudaTextureReadMode {
     cudaReadModeElementType = 0,
@@ -270,94 +282,109 @@ enum cudaTextureReadMode {
 /**
  * CUDA texture resource view formats
  */
-enum cudaResourceViewFormat
+typedef enum CUresourceViewFormat_enum
 {
-    cudaResViewFormatNone                      = 0x00, /**< No resource view format (use underlying resource format) */
-    cudaResViewFormatUnsignedChar1             = 0x01, /**< 1 channel unsigned 8-bit integers */
-    cudaResViewFormatUnsignedChar2             = 0x02, /**< 2 channel unsigned 8-bit integers */
-    cudaResViewFormatUnsignedChar4             = 0x03, /**< 4 channel unsigned 8-bit integers */
-    cudaResViewFormatSignedChar1               = 0x04, /**< 1 channel signed 8-bit integers */
-    cudaResViewFormatSignedChar2               = 0x05, /**< 2 channel signed 8-bit integers */
-    cudaResViewFormatSignedChar4               = 0x06, /**< 4 channel signed 8-bit integers */
-    cudaResViewFormatUnsignedShort1            = 0x07, /**< 1 channel unsigned 16-bit integers */
-    cudaResViewFormatUnsignedShort2            = 0x08, /**< 2 channel unsigned 16-bit integers */
-    cudaResViewFormatUnsignedShort4            = 0x09, /**< 4 channel unsigned 16-bit integers */
-    cudaResViewFormatSignedShort1              = 0x0a, /**< 1 channel signed 16-bit integers */
-    cudaResViewFormatSignedShort2              = 0x0b, /**< 2 channel signed 16-bit integers */
-    cudaResViewFormatSignedShort4              = 0x0c, /**< 4 channel signed 16-bit integers */
-    cudaResViewFormatUnsignedInt1              = 0x0d, /**< 1 channel unsigned 32-bit integers */
-    cudaResViewFormatUnsignedInt2              = 0x0e, /**< 2 channel unsigned 32-bit integers */
-    cudaResViewFormatUnsignedInt4              = 0x0f, /**< 4 channel unsigned 32-bit integers */
-    cudaResViewFormatSignedInt1                = 0x10, /**< 1 channel signed 32-bit integers */
-    cudaResViewFormatSignedInt2                = 0x11, /**< 2 channel signed 32-bit integers */
-    cudaResViewFormatSignedInt4                = 0x12, /**< 4 channel signed 32-bit integers */
-    cudaResViewFormatHalf1                     = 0x13, /**< 1 channel 16-bit floating point */
-    cudaResViewFormatHalf2                     = 0x14, /**< 2 channel 16-bit floating point */
-    cudaResViewFormatHalf4                     = 0x15, /**< 4 channel 16-bit floating point */
-    cudaResViewFormatFloat1                    = 0x16, /**< 1 channel 32-bit floating point */
-    cudaResViewFormatFloat2                    = 0x17, /**< 2 channel 32-bit floating point */
-    cudaResViewFormatFloat4                    = 0x18, /**< 4 channel 32-bit floating point */
-    cudaResViewFormatUnsignedBlockCompressed1  = 0x19, /**< Block compressed 1 */
-    cudaResViewFormatUnsignedBlockCompressed2  = 0x1a, /**< Block compressed 2 */
-    cudaResViewFormatUnsignedBlockCompressed3  = 0x1b, /**< Block compressed 3 */
-    cudaResViewFormatUnsignedBlockCompressed4  = 0x1c, /**< Block compressed 4 unsigned */
-    cudaResViewFormatSignedBlockCompressed4    = 0x1d, /**< Block compressed 4 signed */
-    cudaResViewFormatUnsignedBlockCompressed5  = 0x1e, /**< Block compressed 5 unsigned */
-    cudaResViewFormatSignedBlockCompressed5    = 0x1f, /**< Block compressed 5 signed */
-    cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */
-    cudaResViewFormatSignedBlockCompressed6H   = 0x21, /**< Block compressed 6 signed half-float */
-    cudaResViewFormatUnsignedBlockCompressed7  = 0x22  /**< Block compressed 7 */
-};
+    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
+} CUresourceViewFormat;
 
-struct cudaResourceViewDesc {
-    enum cudaResourceViewFormat format;
-    size_t width;
-    size_t height;
-    size_t depth;
-    unsigned int firstMipmapLevel;
-    unsigned int lastMipmapLevel;
-    unsigned int firstLayer;
-    unsigned int lastLayer;
-};
+/**
+ * Resource view descriptor
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st
+{
+    CUresourceViewFormat format;   /**< Resource view format */
+    size_t width;                  /**< Width of the resource view */
+    size_t height;                 /**< Height of the resource view */
+    size_t depth;                  /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int firstLayer;       /**< First layer index */
+    unsigned int lastLayer;        /**< Last layer index */
+    unsigned int reserved[16];
+} CUDA_RESOURCE_VIEW_DESC;
 
-struct cudaTextureDesc {
-    enum cudaTextureAddressMode addressMode[3];
-    enum cudaTextureFilterMode filterMode;
-    enum cudaTextureReadMode readMode;
-    int sRGB;
-    float borderColor[4];
-    int normalizedCoords;
-    unsigned int maxAnisotropy;
-    enum cudaTextureFilterMode mipmapFilterMode;
-    float mipmapLevelBias;
-    float minMipmapLevelClamp;
-    float maxMipmapLevelClamp;
-};
+/**
+ * Texture descriptor
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3];  /**< Address modes */
+    CUfilter_mode filterMode;       /**< Filter mode */
+    unsigned int flags;             /**< Flags */
+    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias;          /**< Mipmap level bias */
+    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
+    float borderColor[4];           /**< Border Color */
+    int reserved[12];
+} CUDA_TEXTURE_DESC;
 
-struct cudaResourceDesc {
-    enum cudaResourceType resType;
+typedef struct CUDA_RESOURCE_DESC_st
+{
+    CUresourcetype resType;                   /**< Resource type */
 
     union {
         struct {
-            // cudaArray_t array;
+            // CUarray hArray;                   /**< CUDA array */
         } array;
         struct {
-            // cudaMipmappedArray_t mipmap;
+            // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
         } mipmap;
         struct {
-            void *devPtr;
-            struct cudaChannelFormatDesc desc;
-            size_t sizeInBytes;
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t sizeInBytes;               /**< Size in bytes */
         } linear;
         struct {
-            void *devPtr;
-            struct cudaChannelFormatDesc desc;
-            size_t width;
-            size_t height;
-            size_t pitchInBytes;
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t width;                     /**< Width of the array in elements */
+            size_t height;                    /**< Height of the array in elements */
+            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
         } pitch2D;
+        struct {
+            int reserved[32];
+        } reserved;
     } res;
-};
+
+    unsigned int flags;                       /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC;
 
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
 

From c80ff231e33dd842a09bc393cda74bac4449d3c8 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 16:37:29 -0700
Subject: [PATCH 04/13] Add align_gpu_buffers pass and CUDA texture pitch alignment checks (v1)

---
 src/AlignGPUBuffers.cpp | 164 ++++++++++++++++++++++++++++++++++++++++
 src/AlignGPUBuffers.h   |  24 ++++++
 src/CMakeLists.txt      |   2 +
 src/CodeGen_PTX_Dev.cpp |   3 -
 src/Lower.cpp           |   6 ++
 src/runtime/cuda.cpp    |  70 +++++++++++++++--
 src/runtime/mini_cuda.h |  25 ++++++
 7 files changed, 285 insertions(+), 9 deletions(-)
 create mode 100644 src/AlignGPUBuffers.cpp
 create mode 100644 src/AlignGPUBuffers.h

diff --git a/src/AlignGPUBuffers.cpp b/src/AlignGPUBuffers.cpp
new file mode 100644
index 000000000000..f337024ffc2a
--- /dev/null
+++ b/src/AlignGPUBuffers.cpp
@@ -0,0 +1,164 @@
+#include "InjectHostDevBufferCopies.h"
+
+#include "CodeGen_GPU_Dev.h"
+#include "Debug.h"
+#include "ExternFuncArgument.h"
+#include "IRMutator.h"
+#include "IROperator.h"
+#include "IRPrinter.h"
+#include "Substitute.h"
+
+#include <map>
+#include <utility>
+
+namespace Halide {
+namespace Internal {
+
+using std::set;
+using std::string;
+using std::vector;
+
+namespace {
+
+class FindTexturesInGPU : public IRVisitor {
+    public:
+    set<string> textures;
+
+    private:
+    bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
+
+    void visit(const Call *op) override {
+        if (in_gpu && op->is_intrinsic(Call::image_load)) {
+            debug(2) << " load call to " << op->name << " " << textures.count(op->name) << "\n";
+            textures.insert(op->args[0].as<StringImm>()->value);
+        }
+
+        IRVisitor::visit(op);
+    }
+
+    void visit(const For *op) override {
+        bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
+        if (op->for_type == ForType::GPUBlock ||
+            op->for_type == ForType::GPUThread) {
+            in_gpu = true;
+            in_device_api = op->device_api;
+        }
+        IRVisitor::visit(op);
+        in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
+    }
+};
+
+class FindBufferInitType : public IRVisitor {
+    public:
+    Type type;
+
+    private:
+    void visit(const Call *op) override {
+        if (op->name == Call::buffer_init) {
+            internal_assert(op->args.size() == 10) << "don't understand the format of buffer_init";
+            
+            halide_type_code_t code = (halide_type_code_t)op->args[5].as<IntImm>()->value;
+            int bits = op->args[6].as<IntImm>()->value;
+            type = Type(code, bits, 1);
+        }
+
+        IRVisitor::visit(op);
+    }
+};
+
+class AdjustAllocationStride : public IRMutator {
+    Type buffer_type;
+private:
+    Stmt visit(const LetStmt *op) override {
+        if (op->name == buffer) {
+            bool old_in_buffer = in_buffer;
+            debug(2) << " enter buffer " << op->name << "\n";
+            internal_assert(!old_in_buffer) << " Already in buffer?!?";
+            in_buffer = true;
+
+            FindBufferInitType typeFinder;
+            op->accept(&typeFinder);
+            buffer_type = typeFinder.type;
+
+            debug(2) << " found type " << buffer_type << "\n";
+
+            Expr new_value = mutate(op->value);
+            debug(2) << " new struct value " << new_value;
+            debug(2) << " exit buffer " << op->name << "\n";
+            in_buffer = old_in_buffer;
+
+            return LetStmt::make(op->name, new_value, op->body);
+        } else {
+            return IRMutator::visit(op);
+        }
+    }
+
+    Expr visit(const Call *op) override {
+        if (in_buffer) {
+            debug(2) << " in buffer call " << op->name << "\n";
+
+            if (op->is_intrinsic(Call::make_struct)) {
+                internal_assert(op->args.size() % 4 == 0) << "unknown format of make_struct for buffer";
+
+                vector<Expr> args = op->args;
+                if (args.size() >= 8) {
+                    Expr row_width = args[1];
+                    Expr current_stride = args[6];
+
+                    // TODO: query the device's texture pitch alignment at runtime instead of hard-coding it.
+                    int target_align_bytes = 32;
+
+                    int target_align_items = target_align_bytes / buffer_type.bytes();
+                    Expr target_align_expr = IntImm::make(Int(32), target_align_items);
+                    
+                    Expr row_tail_items = Mod::make(current_stride, target_align_expr);
+                    Expr row_extra_items = Sub::make(target_align_expr, row_tail_items);
+
+                    Expr padded_stride = Select::make(
+                        EQ::make(row_tail_items, IntImm::make(Int(32), 0)),
+                        current_stride,
+                        Add::make(current_stride, row_extra_items)
+                    );
+                    args[6] = padded_stride;
+
+                    debug(2) << " old struct: " << static_cast<Expr>(op) << "\n";
+                    Expr new_call = Call::make(op->type, op->name, args, op->call_type).as<Call>();
+                    debug(2) << " new struct: " << new_call << "\n";
+                    return new_call;
+                }
+            }
+
+            return IRMutator::visit(op);
+        } else {
+            return IRMutator::visit(op);
+        }
+    }
+
+    string buffer;
+    bool in_buffer = false;
+
+public:
+    AdjustAllocationStride(string b)
+        : buffer(std::move(b)) {
+    }
+};
+
+}  // namespace
+
+Stmt align_gpu_buffers(Stmt s, const Target &t) {
+
+    // Handle inputs and outputs
+    FindTexturesInGPU finder;
+    s.accept(&finder);
+    for (const string& texture : finder.textures) {
+        s = AdjustAllocationStride(texture + ".buffer").mutate(s);
+    }
+
+    return s;
+}
+
+}  // namespace Internal
+}  // namespace Halide
diff --git a/src/AlignGPUBuffers.h b/src/AlignGPUBuffers.h
new file mode 100644
index 000000000000..bcd72d7c6fec
--- /dev/null
+++ b/src/AlignGPUBuffers.h
@@ -0,0 +1,24 @@
+#ifndef HALIDE_ALIGN_GPU_BUFFERS_H
+#define HALIDE_ALIGN_GPU_BUFFERS_H
+
+/** \file
+ * Defines the lowering pass that pads the stride of buffers read as GPU textures.
+ */
+
+#include <string>
+#include <vector>
+
+#include "Expr.h"
+#include "Target.h"
+
+namespace Halide {
+namespace Internal {
+
+/** Pad the stride of each buffer that is read with image_load inside a
+ * GPU loop up to a multiple of 32 bytes (the assumed texture pitch alignment). */
+Stmt align_gpu_buffers(Stmt s, const Target &t);
+
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 29458c7db0d9..9e20d5a2ff2d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ set(HEADER_FILES
     AddAtomicMutex.h
     AddImageChecks.h
     AddParameterChecks.h
+    AlignGPUBuffers.h
     AlignLoads.h
     AllocationBoundsInference.h
     ApplySplit.h
@@ -173,6 +174,7 @@ set(SOURCE_FILES
     AddAtomicMutex.cpp
     AddImageChecks.cpp
     AddParameterChecks.cpp
+    AlignGPUBuffers.cpp
     AlignLoads.cpp
     AllocationBoundsInference.cpp
     ApplySplit.cpp
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 40160fa6ce32..13ddec62c1f2 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -211,9 +211,6 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
             coords.push_back(codegen(op->args[i]));
         }
         llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
-        // call->getCalledFunction()->setCallingConv(CallingConv::Tail);
-        // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords);
-        // call->setTailCall(true);
         value = builder->CreateExtractValue(call, {0});
 
     } else {
diff --git a/src/Lower.cpp b/src/Lower.cpp
index 24fdbc47acf0..3bd946083504 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -9,6 +9,7 @@
 #include "AddAtomicMutex.h"
 #include "AddImageChecks.h"
 #include "AddParameterChecks.h"
+#include "AlignGPUBuffers.h"
 #include "AllocationBoundsInference.h"
 #include "AsyncProducers.h"
 #include "BoundSmallAllocations.h"
@@ -411,6 +412,11 @@ Module lower(const vector<Function> &output_funcs,
         s = lower_warp_shuffles(s);
         debug(2) << "Lowering after injecting warp shuffles:\n"
                  << s << "\n\n";
+
+        debug(1) << "Aligning GPU Buffers...\n";
+        s = align_gpu_buffers(s, t);
+        debug(2) << "Lowering after aligning GPU buffers:\n"
+                 << s << "\n\n";
     }
 
     debug(1) << "Simplifying...\n";
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 1db647d899e6..0f58c164d95b 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -371,6 +371,8 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
         int max_block_size[] = {0, 0, 0};
         int max_grid_size[] = {0, 0, 0};
         int max_shared_mem = 0, max_constant_mem = 0;
+        int max_texture1d = 0, max_texture2d_width = 0, max_texture2d_height = 0;
+        int texture_pitch_align = 0, max_texture2d_linear_pitch = 0;
         int cc_major = 0, cc_minor = 0;
 
         struct {
@@ -390,6 +392,11 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
             {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
             {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
             {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
+            {&max_texture1d, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH},
+            {&max_texture2d_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH},
+            {&max_texture2d_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT},
+            {&texture_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT},
+            {&max_texture2d_linear_pitch, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH},
             {nullptr, CU_DEVICE_ATTRIBUTE_MAX}};
 
         // Do all the queries.
@@ -441,7 +448,10 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
             << "      max constant memory per block: " << max_constant_mem << "\n"
             << "      compute capability " << cc_major << "." << cc_minor << "\n"
             << "      cuda cores: " << num_cores << " x " << threads_per_core
-            << " = " << num_cores * threads_per_core << "\n";
+            << " = " << num_cores * threads_per_core << "\n"
+            << "      texture pitch align: " << texture_pitch_align << "\n"
+            << "      texture max 2d pitch: " << max_texture2d_linear_pitch << "\n"
+            << "      texture max size: 1d: " << max_texture1d << " 2d: (" << max_texture2d_width << "," << max_texture2d_height << ") \n";
     }
 #endif
 
@@ -1101,6 +1111,8 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
 
 namespace {
 WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
+    CUresult err;
+    int texture_row_pitch_align_required = 0;
     debug(user_context)
         << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n";
 
@@ -1111,6 +1123,34 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         return 0;
     }
 
+    {
+        Context ctx(user_context);
+        if (ctx.error != 0) {
+            return 0;
+        }
+
+        CUresult err;
+
+        CUdevice dev;
+        err = cuCtxGetDevice(&dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuCtxGetDevice failed ("
+                << Halide::Runtime::Internal::Cuda::get_error_name(err)
+                << ")";
+            return 0;
+        }
+
+        err = cuDeviceGetAttribute(&texture_row_pitch_align_required, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuDeviceGetAttribute failed ("
+                << get_error_name(err)
+                << ")";
+            return 0;
+        }
+    }
+
     CUDA_RESOURCE_DESC resourceDesc;
     CUDA_TEXTURE_DESC textureDesc;
     // CUDA_RESOURCE_VIEW_DESC resourceViewDesc;
@@ -1118,7 +1158,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
     memset(&resourceDesc, 0, sizeof(resourceDesc));
     memset(&textureDesc, 0, sizeof(textureDesc));
 
-    // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT;
+    // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT
 
     CUarray_format format = (CUarray_format)0;
     struct halide_type_t type = buf->type;
@@ -1130,6 +1170,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         } else if (type.bits == 32) {
             format = CU_AD_FORMAT_SIGNED_INT32;
         }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
     } else if (type.code == halide_type_uint) {
         if (type.bits == 8) {
             format = CU_AD_FORMAT_UNSIGNED_INT8;
@@ -1138,6 +1179,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         } else if (type.bits == 32) {
             format = CU_AD_FORMAT_UNSIGNED_INT32;
         }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
     } else if (type.code == halide_type_float) {
         if (type.bits == 16) {
             format = CU_AD_FORMAT_HALF;
@@ -1150,13 +1192,20 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         return 0;
     }
 
+    debug(user_context) << " buffer dims " << buf->dimensions;
+
+    if (buf->dim[0].stride != 1) {
+        error(user_context) << "CUDA requires inner stride to be 1";
+    }
+
     resourceDesc.flags = 0;
     if (buf->dimensions == 1) {
         resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR;
-        resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device; 
+        resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device;
         resourceDesc.res.linear.format = format;
         resourceDesc.res.linear.numChannels = 1;
         resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes();
+
     } else if (buf->dimensions == 2) {
         resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
         resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device;
@@ -1164,14 +1213,23 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         resourceDesc.res.pitch2D.numChannels = 1;
         resourceDesc.res.pitch2D.width = buf->dim[0].extent;
         resourceDesc.res.pitch2D.height = buf->dim[1].extent;
-        resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride;
+        resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride * type.bytes();
+
+        debug(user_context) << " type " << format << " width " << (int)resourceDesc.res.pitch2D.width
+                            << " height " << (int)resourceDesc.res.pitch2D.height << " pitch " << (int)resourceDesc.res.pitch2D.pitchInBytes << "\n";
+
+        if (resourceDesc.res.pitch2D.pitchInBytes % texture_row_pitch_align_required) {
+            error(user_context) << "row stride of " << (int)resourceDesc.res.pitch2D.pitchInBytes
+                                << " must be aligned to " << texture_row_pitch_align_required << " bytes for CUDA textures";
+            return 0;
+        }
     } else {
         error(user_context) << "cuda texture support only handles 1d and td textures";
         return 0;
     }
 
     CUtexObject texture = 0;
-    CUresult err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr);
+    err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr);
 
     if (err != CUDA_SUCCESS) {
         error(user_context)
@@ -1263,7 +1321,7 @@ WEAK int halide_cuda_run(void *user_context,
                 CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true);
 
                 if (!texture) {
-                    error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed";
+                    error(user_context) << "CUDA: halide_cuda_get_texture for arg " << (int)i << " failed";
                     free(dev_handles);
                     free(translated_args);
                     return -1;
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index a2be5f0aa4ce..3c78400d0e22 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -388,6 +388,31 @@ typedef struct CUDA_RESOURCE_DESC_st
 
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
 
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB  0x10
+
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime

From 39f548a9c373880cceef157a2032915056a19656 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 19:32:54 -0700
Subject: [PATCH 05/13] works!

---
 src/Lower.cpp                    | 20 ++++++-----
 src/StorageFlattening.cpp        | 58 ++++++++++++++++++++++++++++++--
 test/correctness/gpu_texture.cpp | 40 +++++++++++++++-------
 3 files changed, 94 insertions(+), 24 deletions(-)

diff --git a/src/Lower.cpp b/src/Lower.cpp
index 3bd946083504..790e4e6a805e 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -272,6 +272,13 @@ Module lower(const vector<Function> &output_funcs,
     debug(2) << "Lowering after bounding small realizations:\n"
              << s << "\n\n";
 
+    if (will_inject_host_copies) {
+        debug(1) << "Selecting a GPU API for GPU loops...\n";
+        s = select_gpu_api(s, t);
+        debug(2) << "Lowering after selecting a GPU API:\n"
+                 << s << "\n\n";
+    }
+
     debug(1) << "Performing storage flattening...\n";
     s = storage_flattening(s, outputs, env, t);
     debug(2) << "Lowering after storage flattening:\n"
@@ -297,11 +304,6 @@ Module lower(const vector<Function> &output_funcs,
     }
 
     if (will_inject_host_copies) {
-        debug(1) << "Selecting a GPU API for GPU loops...\n";
-        s = select_gpu_api(s, t);
-        debug(2) << "Lowering after selecting a GPU API:\n"
-                 << s << "\n\n";
-
         debug(1) << "Injecting host <-> dev buffer copies...\n";
         s = inject_host_dev_buffer_copies(s, t);
         debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
@@ -413,10 +415,10 @@ Module lower(const vector<Function> &output_funcs,
         debug(2) << "Lowering after injecting warp shuffles:\n"
                  << s << "\n\n";
 
-        debug(1) << "Aligning GPU Buffers...\n";
-        s = align_gpu_buffers(s, t);
-        debug(2) << "Lowering after aligning GPU buffers:\n"
-                 << s << "\n\n";
+        // debug(1) << "Aligning GPU Buffers...\n";
+        // s = align_gpu_buffers(s, t);
+        // debug(2) << "Lowering after aligning GPU buffers:\n"
+        //          << s << "\n\n";
     }
 
     debug(1) << "Simplifying...\n";
diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp
index aab178b4bc05..03a048e41e97 100644
--- a/src/StorageFlattening.cpp
+++ b/src/StorageFlattening.cpp
@@ -22,6 +22,39 @@ using std::string;
 using std::vector;
 
 namespace {
+class FindBuffersInGPU : public IRVisitor {
+public:
+    map<string, set<DeviceAPI>> buffer_device_usage;
+
+private:
+    bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
+    using IRVisitor::visit;
+
+    void visit(const Call *op) override {
+        debug(2) << " candidate load to " << op->name << " " << in_device_api << "\n";
+        if (in_gpu &&
+            (op->call_type == Call::Halide || op->call_type == Call::Image)) {
+            debug(2) << " load call to " << op->name << " " << in_device_api << "\n";
+            buffer_device_usage[op->name].insert(in_device_api);
+        }
+
+        IRVisitor::visit(op);
+    }
+
+    void visit(const For *op) override {
+        bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
+        if (op->for_type == ForType::GPUBlock ||
+            op->for_type == ForType::GPUThread) {
+            in_gpu = true;
+            in_device_api = op->device_api;
+        }
+        IRVisitor::visit(op);
+        in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
+    }
+};
 
 class FlattenDimensions : public IRMutator {
 public:
@@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
         }
     }
 
+    map<string, set<DeviceAPI>> buffer_apis;
+
 private:
     const map<string, pair<Function, int>> &env;
     set<string> outputs;
@@ -117,7 +152,7 @@ class FlattenDimensions : public IRMutator {
 
         if (op->memory_type == MemoryType::GPUTexture) {
             textures.insert(op->name);
-            debug(2) << "found texture " << op->name << "\n";
+            debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
         }
 
         Stmt body = mutate(op->body);
@@ -153,11 +188,23 @@ class FlattenDimensions : public IRMutator {
                     if (args[j] == storage_dims[i].var) {
                         storage_permutation.push_back((int)j);
                         Expr alignment = storage_dims[i].alignment;
+
                         if (alignment.defined()) {
                             allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
                         } else {
                             allocation_extents[j] = extents[j];
                         }
+
+                        // Promote row alignment for buffers used as CUDA Textures
+                        if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
+                            // This could be symbolically fetched from runtime I guess?
+                            int target_align_bytes = 32;
+                            int target_align_items = target_align_bytes / op->types[0].bytes();
+
+                            debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";
+
+                            allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
+                        }
                     }
                 }
                 internal_assert(storage_permutation.size() == i + 1);
@@ -260,7 +307,7 @@ class FlattenDimensions : public IRMutator {
             Expr store = Call::make(value.type(), Call::image_store,
                                     args, Call::Intrinsic);
             return Evaluate::make(store);
-        } else if (in_gpu && textures.count(op->name) && false && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory
+        } else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) {  // CUDA writes are still directly to memory
             Expr buffer_var =
                 Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
             vector<Expr> args(2);
@@ -487,7 +534,12 @@ Stmt storage_flattening(Stmt s,
         }
     }
 
-    s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
+    FindBuffersInGPU finder;
+    s.accept(&finder);
+    FlattenDimensions flatten(tuple_env, outputs, target);
+    flatten.buffer_apis = finder.buffer_device_usage;
+
+    s = flatten.mutate(s);
     s = PromoteToMemoryType().mutate(s);
     return s;
 }
diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp
index 3de269d07fa2..a0862f9d64ec 100644
--- a/test/correctness/gpu_texture.cpp
+++ b/test/correctness/gpu_texture.cpp
@@ -7,6 +7,7 @@ using namespace Halide::Internal;
 
 int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
+    bool success = true;
 
     if (!(t.has_feature(halide_target_feature_opencl) || t.has_feature(halide_target_feature_cuda_capability30))) {
         printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n");
@@ -26,7 +27,7 @@ int main(int argc, char **argv) {
 
     // Check dynamic allocations into Heap and Texture memory
     for (auto memory_type : {MemoryType::GPUTexture, MemoryType::Heap}) {
-        {
+        if (false) {
             // 1D stores/loads
             Buffer<int> input(100);
             input.fill(10);
@@ -51,13 +52,18 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
         {
+            int size = 17;
             // 2D stores/loads
-            Buffer<int> input(10, 10);
+
+            // to get a buffer with 32-byte row pitch
+            Buffer<int> input(24, size);
+            input.crop(0, 0, 17);
+
             input.fill(10);
             ImageParam param(Int(32), 2);
             param.set(input);
@@ -70,21 +76,24 @@ int main(int argc, char **argv) {
             f(x, y) = cast<float>(x + y);
             g(x) = param(x, x) + cast<int>(f(2 * x, x));
 
-            g.gpu_tile(x, xi, 16, TailStrategy::GuardWithIf);
+            g.gpu_tile(x, xi, 8);
 
             f.compute_root().store_in(memory_type).gpu_blocks(x, y);  // store f as integer
             g.store_in(memory_type);
+            g.bound(x, 0, size);
 
-            Buffer<int> out = g.realize(10);
-            for (int x = 0; x < 10; x++) {
+            g.compile_to_lowered_stmt("/tmp/stmt.html", {param}, Halide::HTML);
+
+            Buffer<int> out = g.realize(size);
+            for (int x = 0; x < size; x++) {
                 int correct = 3 * x + 10;
                 if (out(x) != correct) {
                     printf("out[2D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
-        {
+        if (t.has_feature(halide_target_feature_opencl)) {  // no 3d in our cuda support right now
             // 3D stores/loads
             Buffer<int> input(10, 10, 10);
             input.fill(10);
@@ -110,7 +119,7 @@ int main(int argc, char **argv) {
                 int correct = 4 * x + 10;
                 if (out(x) != correct) {
                     printf("out[3D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
@@ -143,12 +152,19 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D-shift][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
+        if (!success) {
+            break;
+        }
     }
 
-    printf("Success!\n");
-    return 0;
+    if (success) {
+        printf("Success!\n");
+        return 0;
+    }
+    printf("Failed!\n");
+    return 1;
 }

From e16a5aff161f182def18ee019dd90ee611d8f4d4 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 19:41:26 -0700
Subject: [PATCH 06/13] Cleanup

---
 src/AlignGPUBuffers.cpp | 164 ----------------------------------------
 src/AlignGPUBuffers.h   |  24 ------
 src/CMakeLists.txt      |   2 -
 src/Lower.cpp           |   6 --
 4 files changed, 196 deletions(-)
 delete mode 100644 src/AlignGPUBuffers.cpp
 delete mode 100644 src/AlignGPUBuffers.h

diff --git a/src/AlignGPUBuffers.cpp b/src/AlignGPUBuffers.cpp
deleted file mode 100644
index f337024ffc2a..000000000000
--- a/src/AlignGPUBuffers.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-#include "InjectHostDevBufferCopies.h"
-
-#include "CodeGen_GPU_Dev.h"
-#include "Debug.h"
-#include "ExternFuncArgument.h"
-#include "IRMutator.h"
-#include "IROperator.h"
-#include "IRPrinter.h"
-#include "Substitute.h"
-
-#include <map>
-#include <utility>
-
-namespace Halide {
-namespace Internal {
-
-using std::set;
-using std::string;
-using std::vector;
-
-namespace {
-
-class FindTexturesInGPU : public IRVisitor {
-    public:
-    set<string> textures;
-
-    private:
-    bool in_gpu = false;
-    DeviceAPI in_device_api = DeviceAPI::None;
-
-    void visit(const Call *op) override {
-        if (in_gpu && op->is_intrinsic(Call::image_load)) {
-            debug(2) << " load call to " << op->name << " " << textures.count(op->name) << "\n";
-            textures.insert(op->args[0].as<StringImm>()->value);
-        }
-
-        IRVisitor::visit(op);
-    }
-
-    void visit(const For *op) override {
-        bool old_in_gpu = in_gpu;
-        DeviceAPI old_in_device_api = in_device_api;
-        if (op->for_type == ForType::GPUBlock ||
-            op->for_type == ForType::GPUThread) {
-            in_gpu = true;
-            in_device_api = op->device_api;
-        }
-        IRVisitor::visit(op);
-        in_gpu = old_in_gpu;
-        in_device_api = old_in_device_api;
-    }
-};
-
-class FindBufferInitType : public IRVisitor {
-    public:
-    Type type;
-
-    private:
-    void visit(const Call *op) override {
-        if (op->name == Call::buffer_init) {
-            internal_assert(op->args.size() == 10) << "don't understand the format of buffer_init";
-            
-            halide_type_code_t code = (halide_type_code_t)op->args[5].as<IntImm>()->value;
-            int bits = op->args[6].as<IntImm>()->value;
-            type = Type(code, bits, 1);
-        }
-
-        IRVisitor::visit(op);
-    }
-};
-
-class AdjustAllocationStride : public IRMutator {
-    Type buffer_type;
-private:
-    Stmt visit(const LetStmt *op) override {
-        if (op->name == buffer) {
-            bool old_in_buffer = in_buffer;
-            debug(2) << " enter buffer " << op->name << "\n";
-            internal_assert(!old_in_buffer) << " Already in buffer?!?";
-            in_buffer = true;
-
-            FindBufferInitType typeFinder;
-            op->accept(&typeFinder);
-            buffer_type = typeFinder.type;
-
-            debug(2) << " found type " << buffer_type << "\n";
-
-            Expr new_value = mutate(op->value);
-            debug(2) << " new struct value " << new_value;
-            debug(2) << " exit buffer " << op->name << "\n";
-            in_buffer = old_in_buffer;
-
-            return LetStmt::make(op->name, new_value, op->body);
-        } else {
-            return IRMutator::visit(op);
-        }
-    }
-
-    Expr visit(const Call *op) override {
-        if (in_buffer) {
-            debug(2) << " in buffer call " << op->name << "\n";
-
-            if (op->is_intrinsic(Call::make_struct)) {
-                internal_assert(op->args.size() % 4 == 0) << "unknown format of make_struct for buffer";
-
-                vector<Expr> args = op->args;
-                if (args.size() >= 8) {
-                    Expr row_width = args[1];
-                    Expr current_stride = args[6];
-
-                    // This could be symbolically fetched from runtime I guess?
-                    int target_align_bytes = 32;
-
-                    int target_align_items = target_align_bytes / buffer_type.bytes();
-                    Expr target_align_expr = IntImm::make(Int(32), target_align_items);
-                    
-                    Expr row_tail_items = Mod::make(current_stride, target_align_expr);
-                    Expr row_extra_items = Sub::make(target_align_expr, row_tail_items);
-
-                    Expr padded_stride = Select::make(
-                        EQ::make(row_tail_items, IntImm::make(Int(32), 0)),
-                        current_stride,
-                        Add::make(current_stride, row_extra_items)
-                    );
-                    args[6] = padded_stride;
-
-                    debug(2) << " old struct: " << static_cast<Expr>(op) << "\n";
-                    Expr new_call = Call::make(op->type, op->name, args, op->call_type).as<Call>();
-                    debug(2) << " new struct: " << new_call << "\n";
-                    return new_call;
-                }
-            }
-
-            return IRMutator::visit(op);
-        } else {
-            return IRMutator::visit(op);
-        }
-    }
-
-    string buffer;
-    bool in_buffer = false;
-
-public:
-    AdjustAllocationStride(string b)
-        : buffer(std::move(b)) {
-    }
-};
-
-}  // namespace
-
-Stmt align_gpu_buffers(Stmt s, const Target &t) {
-
-    // Handle inputs and outputs
-    FindTexturesInGPU finder;
-    s.accept(&finder);
-    for (const string& texture : finder.textures) {
-        s = AdjustAllocationStride(texture + ".buffer").mutate(s);
-    }
-
-    return s;
-}
-
-}  // namespace Internal
-}  // namespace Halide
diff --git a/src/AlignGPUBuffers.h b/src/AlignGPUBuffers.h
deleted file mode 100644
index bcd72d7c6fec..000000000000
--- a/src/AlignGPUBuffers.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef HALIDE_ALIGN_GPU_BUFFERS_H
-#define HALIDE_ALIGN_GPU_BUFFERS_H
-
-/** \file
- * Defines the lowering passes that deal with host and device buffer flow.
- */
-
-#include <string>
-#include <vector>
-
-#include "Expr.h"
-#include "Target.h"
-
-namespace Halide {
-namespace Internal {
-
-/** Inject calls to halide_device_malloc, halide_copy_to_device, and
- * halide_copy_to_host as needed. */
-Stmt align_gpu_buffers(Stmt s, const Target &t);
-
-}  // namespace Internal
-}  // namespace Halide
-
-#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9e20d5a2ff2d..29458c7db0d9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,7 +8,6 @@ set(HEADER_FILES
     AddAtomicMutex.h
     AddImageChecks.h
     AddParameterChecks.h
-    AlignGPUBuffers.h
     AlignLoads.h
     AllocationBoundsInference.h
     ApplySplit.h
@@ -174,7 +173,6 @@ set(SOURCE_FILES
     AddAtomicMutex.cpp
     AddImageChecks.cpp
     AddParameterChecks.cpp
-    AlignGPUBuffers.cpp
     AlignLoads.cpp
     AllocationBoundsInference.cpp
     ApplySplit.cpp
diff --git a/src/Lower.cpp b/src/Lower.cpp
index 790e4e6a805e..773c84d1cabd 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -9,7 +9,6 @@
 #include "AddAtomicMutex.h"
 #include "AddImageChecks.h"
 #include "AddParameterChecks.h"
-#include "AlignGPUBuffers.h"
 #include "AllocationBoundsInference.h"
 #include "AsyncProducers.h"
 #include "BoundSmallAllocations.h"
@@ -414,11 +413,6 @@ Module lower(const vector<Function> &output_funcs,
         s = lower_warp_shuffles(s);
         debug(2) << "Lowering after injecting warp shuffles:\n"
                  << s << "\n\n";
-
-        // debug(1) << "Aligning GPU Buffers...\n";
-        // s = align_gpu_buffers(s, t);
-        // debug(2) << "Lowering after aligning GPU buffers:\n"
-        //          << s << "\n\n";
     }
 
     debug(1) << "Simplifying...\n";

From 43fd0ae101c0fcc9ec6280c0dc6685784b268f14 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 19:57:02 -0700
Subject: [PATCH 07/13] cleanup

---
 src/CodeGen_LLVM.cpp    |  2 +-
 src/CodeGen_PTX_Dev.cpp | 21 ++++++++++++---------
 src/runtime/cuda.cpp    |  6 +-----
 src/runtime/mini_cuda.h |  6 ------
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 5b9051e18706..e7b4d7954fbd 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1435,7 +1435,7 @@ Value *CodeGen_LLVM::codegen(const Expr &e) {
                     value->getType() == llvm_type_of(e.type()))
         << "Codegen of Expr " << e
         << " of type " << e.type()
-        << " did not produce llvm IR of the corresponding llvm type.  Llvm was " << llvm_type_of(e.type()) << "\n";
+        << " did not produce llvm IR of the corresponding llvm type.\n";
     return value;
 }
 
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 13ddec62c1f2..0b4a6543bff7 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -182,24 +182,28 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
 
         string res_desc = "";
         user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
-        llvm::Type *res_type;
+        llvm::Type *element_type;
         if (op->type.is_float()) {
             res_desc = "f32";
-            auto element = llvm_type_of(Float(32));
-            res_type = llvm::StructType::get(element, element, element, element);
+            element_type = llvm_type_of(Float(32));
         } else {
             res_desc = "s32";
-            auto element = llvm_type_of(Int(32));
-            res_type = llvm::StructType::get(element, element, element, element);
+            element_type = llvm_type_of(Int(32));
         }
+        // PTX returns a 4 element struct (not a vector!) regardless of the element type; only element 0 is used.
+        llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);
 
         string coord_desc = "";
-        user_assert(op->args[2].type().bits() == 32) << "ptx texture sampler only supports 32 bit args";
-        if (op->args[2].type().is_float()) {
+        Type coord_type = op->args[2].type();
+        user_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
+        if (coord_type.is_float()) {
             coord_desc = ".f32";
-        } else {
+        } else if (coord_type.is_uint()) {
+            coord_desc = ".u32";
+        } else if (coord_type.is_int()) {
             coord_desc = ".s32";
         }
+        internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;
 
         string dim = std::to_string(num_args) + "d";
         string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;
@@ -212,7 +216,6 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         }
         llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
         value = builder->CreateExtractValue(call, {0});
-
     } else {
         CodeGen_LLVM::visit(op);
     }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 0f58c164d95b..8651200f6c6b 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1119,7 +1119,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
     halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device);
 
     if (!cuTexObjectCreate) {
-        error(user_context) << "requesting texture object but don't have runtime functions";
+        error(user_context) << "CUDA requesting texture object but don't have runtime functions (cuTexObjectCreate)";
         return 0;
     }
 
@@ -1153,13 +1153,10 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
 
     CUDA_RESOURCE_DESC resourceDesc;
     CUDA_TEXTURE_DESC textureDesc;
-    // CUDA_RESOURCE_VIEW_DESC resourceViewDesc;
 
     memset(&resourceDesc, 0, sizeof(resourceDesc));
     memset(&textureDesc, 0, sizeof(textureDesc));
 
-    // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT
-
     CUarray_format format = (CUarray_format)0;
     struct halide_type_t type = buf->type;
     if (type.code == halide_type_int) {
@@ -1205,7 +1202,6 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t
         resourceDesc.res.linear.format = format;
         resourceDesc.res.linear.numChannels = 1;
         resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes();
-
     } else if (buf->dimensions == 2) {
         resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
         resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device;
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index 3c78400d0e22..0ebfed0c29ee 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -273,12 +273,6 @@ typedef enum CUfilter_mode_enum {
     CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
 } CUfilter_mode;
 
-
-enum cudaTextureReadMode {
-    cudaReadModeElementType = 0,
-    cudaReadModeNormalizedFloat = 1
-};
-
 /**
  * CUDA texture resource view formats
  */

From c695593be2e52e43dcfc5145ab5ff83870e1cb76 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 19:57:48 -0700
Subject: [PATCH 08/13] cleanup formatting

---
 src/runtime/cuda_functions.h |   2 +-
 src/runtime/mini_cuda.h      | 133 +++++++++++++++++------------------
 2 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h
index ba6f352ebb0e..9766146a9e9d 100644
--- a/src/runtime/cuda_functions.h
+++ b/src/runtime/cuda_functions.h
@@ -47,7 +47,7 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p
 
 CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream));
 
-CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc));
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject * pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc));
 CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject));
 
 #undef CUDA_FN
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index 0ebfed0c29ee..8b61a786625c 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -235,32 +235,32 @@ typedef unsigned long long CUtexObject;
  * Array formats
  */
 typedef enum CUarray_format_enum {
-    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,  /**< Unsigned 8-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
-    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
-    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
-    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
-    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
-    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08,    /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09,   /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a,   /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10,           /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20           /**< 32-bit floating point */
 } CUarray_format;
 
 /**
  * Resource types
  */
 typedef enum CUresourcetype_enum {
-    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resoure */
+    CU_RESOURCE_TYPE_ARRAY = 0x00,           /**< Array resource */
     CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
-    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
-    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+    CU_RESOURCE_TYPE_LINEAR = 0x02,          /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D = 0x03          /**< Pitch 2D resource */
 } CUresourcetype;
 
 /**
  * Texture reference addressing modes
  */
 typedef enum CUaddress_mode_enum {
-    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
-    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_WRAP = 0,   /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP = 1,  /**< Clamp to edge address mode */
     CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
     CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
 } CUaddress_mode;
@@ -269,57 +269,55 @@ typedef enum CUaddress_mode_enum {
  * Texture reference filtering modes
  */
 typedef enum CUfilter_mode_enum {
-    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
-    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
 } CUfilter_mode;
 
 /**
  * CUDA texture resource view formats
  */
-typedef enum CUresourceViewFormat_enum
-{
-    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
-    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
-    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
-    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
+typedef enum CUresourceViewFormat_enum {
+    CU_RES_VIEW_FORMAT_NONE = 0x00,          /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01,      /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02,      /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03,      /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04,      /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05,      /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06,      /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07,     /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08,     /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09,     /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a,     /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b,     /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c,     /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d,     /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e,     /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f,     /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10,     /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11,     /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12,     /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13,    /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14,    /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15,    /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16,    /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17,    /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18,    /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19,  /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a,  /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b,  /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c,  /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d,    /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e,  /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f,    /**< Block compressed 5 signed */
     CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
-    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21,   /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22   /**< Block compressed 7 */
 } CUresourceViewFormat;
 
 /**
  * Resource view descriptor
  */
-typedef struct CUDA_RESOURCE_VIEW_DESC_st
-{
+typedef struct CUDA_RESOURCE_VIEW_DESC_st {
     CUresourceViewFormat format;   /**< Resource view format */
     size_t width;                  /**< Width of the resource view */
     size_t height;                 /**< Height of the resource view */
@@ -347,9 +345,8 @@ typedef struct CUDA_TEXTURE_DESC_st {
     int reserved[12];
 } CUDA_TEXTURE_DESC;
 
-typedef struct CUDA_RESOURCE_DESC_st
-{
-    CUresourcetype resType;                   /**< Resource type */
+typedef struct CUDA_RESOURCE_DESC_st {
+    CUresourcetype resType; /**< Resource type */
 
     union {
         struct {
@@ -359,25 +356,25 @@ typedef struct CUDA_RESOURCE_DESC_st
             // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
         } mipmap;
         struct {
-            CUdeviceptr devPtr;               /**< Device pointer */
-            CUarray_format format;            /**< Array format */
-            unsigned int numChannels;         /**< Channels per array element */
-            size_t sizeInBytes;               /**< Size in bytes */
+            CUdeviceptr devPtr;       /**< Device pointer */
+            CUarray_format format;    /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t sizeInBytes;       /**< Size in bytes */
         } linear;
         struct {
-            CUdeviceptr devPtr;               /**< Device pointer */
-            CUarray_format format;            /**< Array format */
-            unsigned int numChannels;         /**< Channels per array element */
-            size_t width;                     /**< Width of the array in elements */
-            size_t height;                    /**< Height of the array in elements */
-            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
+            CUdeviceptr devPtr;       /**< Device pointer */
+            CUarray_format format;    /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t width;             /**< Width of the array in elements */
+            size_t height;            /**< Height of the array in elements */
+            size_t pitchInBytes;      /**< Pitch between two rows in bytes */
         } pitch2D;
         struct {
             int reserved[32];
         } reserved;
     } res;
 
-    unsigned int flags;                       /**< Flags (must be zero) */
+    unsigned int flags; /**< Flags (must be zero) */
 } CUDA_RESOURCE_DESC;
 
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
@@ -393,19 +390,19 @@ typedef struct CUDA_RESOURCE_DESC_st
  * in the range [0,1].
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_READ_AS_INTEGER         0x01
+#define CU_TRSF_READ_AS_INTEGER 0x01
 
 /**
  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
 
 /**
  * Perform sRGB->linear conversion during texture read.
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_SRGB  0x10
+#define CU_TRSF_SRGB 0x10
 
 }  // namespace Cuda
 }  // namespace Internal

From 14fa13a653c044616ddf29a101ee6b6662c67de4 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 20:00:21 -0700
Subject: [PATCH 09/13] cleanup formatting

---
 src/CodeGen_PTX_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 0b4a6543bff7..0d4a8d7d8dc1 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -190,7 +190,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
             res_desc = "s32";
             element_type = llvm_type_of(Int(32));
         }
-        // PTX returns a 4 element struct (not a vector!) regardless of 
+        // PTX returns a 4 element struct (not a vector!) regardless of
         llvm::Type *res_type = llvm::StructType::get(element, element, element, element);
 
         string coord_desc = "";

From ade795255be5b345b80ce972cc0af35b813b2093 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 20:14:24 -0700
Subject: [PATCH 10/13] pasta: use element_type (not element) for the PTX texture result struct

---
 src/CodeGen_PTX_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 0d4a8d7d8dc1..391f8f29b921 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -191,7 +191,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
             element_type = llvm_type_of(Int(32));
         }
         // PTX returns a 4 element struct (not a vector!) regardless of
-        llvm::Type *res_type = llvm::StructType::get(element, element, element, element);
+        llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);
 
         string coord_desc = "";
         Type coord_type = op->args[2].type();

From 46b0de7231ef0d66912ea0ebfaa0e6c43399f057 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 20:17:40 -0700
Subject: [PATCH 11/13] pasta: assert on coord_desc (not coord_type) being non-empty

---
 src/CodeGen_PTX_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 391f8f29b921..ec758a31f44d 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         } else if (coord_type.is_int()) {
             coord_desc = ".s32";
         }
-        internal_assert(coord_type != "") << "unhandled coordinate type for ptx texture sampler " << coord_type;
+        internal_assert(coord_desc != "") << "unhandled coordinate type for ptx texture sampler " << coord_type;
 
         string dim = std::to_string(num_args) + "d";
         string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;

From 009c1d82259281a89ea5f85677cf98d9d24b3405 Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 20:32:50 -0700
Subject: [PATCH 12/13] tidy: use internal_assert for coord checks (coord_desc check left incomplete)

---
 src/CodeGen_PTX_Dev.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index ec758a31f44d..1fd3523aee4a 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -195,7 +195,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
 
         string coord_desc = "";
         Type coord_type = op->args[2].type();
-        user_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
+        internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
         if (coord_type.is_float()) {
             coord_desc = ".f32";
         } else if (coord_type.is_uint()) {
@@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         } else if (coord_type.is_int()) {
             coord_desc = ".s32";
         }
-        internal_assert(coord_desc != "") << "unhandled coordinate type for ptx texture sampler " << coord_type;
+        internal_assert(coord_desc.) << "unhandled coordinate type for ptx texture sampler " << coord_type;
 
         string dim = std::to_string(num_args) + "d";
         string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;

From d70d5f34b3dd2dce553c68da3c3a12abfb93083c Mon Sep 17 00:00:00 2001
From: John Laxson <jlaxson@mac.com>
Date: Mon, 2 Nov 2020 20:45:25 -0700
Subject: [PATCH 13/13] fix: complete coord_desc assertion as !coord_desc.empty()

---
 src/CodeGen_PTX_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 1fd3523aee4a..8b88802bcebc 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         } else if (coord_type.is_int()) {
             coord_desc = ".s32";
         }
-        internal_assert(coord_desc.) << "unhandled coordinate type for ptx texture sampler " << coord_type;
+        internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;
 
         string dim = std::to_string(num_args) + "d";
         string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;