diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index c27ab4d0e788..3262a57e8491 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -441,7 +441,14 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
                                          i));
             }
 
-            builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
+            int8_t buffer_type = 0;
+            if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) {
+                buffer_type = 2;
+            } else if (closure_args[i].is_buffer) {
+                buffer_type = 1;
+            }
+
+            builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
                                  builder->CreateConstGEP2_32(
                                      gpu_arg_is_buffer_arr_type,
                                      gpu_arg_is_buffer_arr,
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index ea876762b3ee..8b88802bcebc 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     vector<llvm::Type *> arg_types(args.size());
     for (size_t i = 0; i < args.size(); i++) {
         if (args[i].is_buffer) {
-            arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
+                arg_types[i] = llvm_type_of(Int(64));
+            } else {
+                arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            }
         } else {
             arg_types[i] = llvm_type_of(args[i].type);
         }
@@ -83,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
 
     // Mark the buffer args as no alias
     for (size_t i = 0; i < args.size(); i++) {
-        if (args[i].is_buffer) {
+        if (args[i].is_buffer && (args[i].write || args[i].memory_type != MemoryType::GPUTexture)) {
             function->addParamAttr(i, Attribute::NoAlias);
         }
     }
@@ -172,6 +176,46 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
         builder->CreateCall(barrier0);
         value = ConstantInt::get(i32_t, 0);
+    } else if (op->is_intrinsic(Call::image_load)) {
+        int num_args = (op->args.size() - 2) / 2;
+        user_assert(num_args >= 1 && num_args <= 2);
+
+        string res_desc = "";
+        user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
+        llvm::Type *element_type;
+        if (op->type.is_float()) {
+            res_desc = "f32";
+            element_type = llvm_type_of(Float(32));
+        } else {
+            res_desc = "s32";
+            element_type = llvm_type_of(Int(32));
+        }
+        // PTX returns a 4 element struct (not a vector!) regardless of how many channels the texture has; only element 0 is used below.
+        llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);
+
+        string coord_desc = "";
+        Type coord_type = op->args[2].type();
+        internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
+        if (coord_type.is_float()) {
+            coord_desc = ".f32";
+        } else if (coord_type.is_uint()) {
+            coord_desc = ".u32";
+        } else if (coord_type.is_int()) {
+            coord_desc = ".s32";
+        }
+        internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;
+
+        string dim = std::to_string(num_args) + "d";
+        string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;
+
+        vector<Value *> coords;
+        coords.push_back(codegen(Variable::make(Int(64), op->args[0].as<StringImm>()->value)));
+        for (size_t i = 2; i < op->args.size(); i += 2) {
+            internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
+            coords.push_back(codegen(op->args[i]));
+        }
+        llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords);
+        value = builder->CreateExtractValue(call, {0});
     } else {
         CodeGen_LLVM::visit(op);
     }
diff --git a/src/Lower.cpp b/src/Lower.cpp
index 24fdbc47acf0..773c84d1cabd 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -271,6 +271,13 @@ Module lower(const vector<Function> &output_funcs,
     debug(2) << "Lowering after bounding small realizations:\n"
              << s << "\n\n";
 
+    if (will_inject_host_copies) {
+        debug(1) << "Selecting a GPU API for GPU loops...\n";
+        s = select_gpu_api(s, t);
+        debug(2) << "Lowering after selecting a GPU API:\n"
+                 << s << "\n\n";
+    }
+
     debug(1) << "Performing storage flattening...\n";
     s = storage_flattening(s, outputs, env, t);
     debug(2) << "Lowering after storage flattening:\n"
@@ -296,11 +303,6 @@ Module lower(const vector<Function> &output_funcs,
     }
 
     if (will_inject_host_copies) {
-        debug(1) << "Selecting a GPU API for GPU loops...\n";
-        s = select_gpu_api(s, t);
-        debug(2) << "Lowering after selecting a GPU API:\n"
-                 << s << "\n\n";
-
         debug(1) << "Injecting host <-> dev buffer copies...\n";
         s = inject_host_dev_buffer_copies(s, t);
         debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp
index e3ad51038666..03a048e41e97 100644
--- a/src/StorageFlattening.cpp
+++ b/src/StorageFlattening.cpp
@@ -22,6 +22,39 @@ using std::string;
 using std::vector;
 
 namespace {
+// Records, per Func/image name, the device APIs of the GPU loops that read it.
+class FindBuffersInGPU : public IRVisitor {
+public:
+    map<string, set<DeviceAPI>> buffer_device_usage;
+
+private:
+    using IRVisitor::visit;
+    // Traversal state: inside a GPU block/thread loop? and the innermost such loop's device API.
+    bool inside_gpu_loop = false;
+    DeviceAPI current_api = DeviceAPI::None;
+
+    void visit(const Call *op) override {
+        debug(2) << " candidate load to " << op->name << " " << current_api << "\n";
+        const bool reads_buffer = op->call_type == Call::Halide || op->call_type == Call::Image;
+        if (inside_gpu_loop && reads_buffer) {
+            debug(2) << " load call to " << op->name << " " << current_api << "\n";
+            buffer_device_usage[op->name].insert(current_api);
+        }
+        IRVisitor::visit(op);
+    }
+
+    void visit(const For *op) override {
+        const bool saved_flag = inside_gpu_loop;
+        const DeviceAPI saved_api = current_api;
+        if (op->for_type == ForType::GPUBlock || op->for_type == ForType::GPUThread) {
+            inside_gpu_loop = true;
+            current_api = op->device_api;
+        }
+        IRVisitor::visit(op);
+        inside_gpu_loop = saved_flag;
+        current_api = saved_api;
+    }
+};
 
 class FlattenDimensions : public IRMutator {
 public:
@@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
         }
     }
 
+    map<string, set<DeviceAPI>> buffer_apis;
+
 private:
     const map<string, pair<Function, int>> &env;
     set<string> outputs;
@@ -42,6 +77,7 @@ class FlattenDimensions : public IRMutator {
     Scope<> realizations, shader_scope_realizations;
     bool in_shader = false;
     bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
 
     Expr make_shape_var(string name, const string &field, size_t dim,
                         const Buffer<> &buf, const Parameter &param) {
@@ -116,7 +152,7 @@ class FlattenDimensions : public IRMutator {
 
         if (op->memory_type == MemoryType::GPUTexture) {
             textures.insert(op->name);
-            debug(2) << "found texture " << op->name << "\n";
+            debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
         }
 
         Stmt body = mutate(op->body);
@@ -152,11 +188,23 @@ class FlattenDimensions : public IRMutator {
                     if (args[j] == storage_dims[i].var) {
                         storage_permutation.push_back((int)j);
                         Expr alignment = storage_dims[i].alignment;
+
                         if (alignment.defined()) {
                             allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
                         } else {
                             allocation_extents[j] = extents[j];
                         }
+
+                        // Promote row alignment for buffers used as CUDA Textures
+                        if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
+                            // This could be symbolically fetched from runtime I guess?
+                            int target_align_bytes = 32;
+                            int target_align_items = target_align_bytes / op->types[0].bytes();
+
+                            debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";
+
+                            allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
+                        }
                     }
                 }
                 internal_assert(storage_permutation.size() == i + 1);
@@ -259,7 +307,7 @@ class FlattenDimensions : public IRMutator {
             Expr store = Call::make(value.type(), Call::image_store,
                                     args, Call::Intrinsic);
             return Evaluate::make(store);
-        } else if (in_gpu && textures.count(op->name)) {
+        } else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) {  // CUDA writes are still directly to memory
             Expr buffer_var =
                 Variable::make(type_of<halide_buffer_t *>(), op->name + ".buffer", output_buf);
             vector<Expr> args(2);
@@ -398,6 +446,7 @@ class FlattenDimensions : public IRMutator {
     Stmt visit(const For *op) override {
         bool old_in_shader = in_shader;
         bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
         if ((op->for_type == ForType::GPUBlock ||
              op->for_type == ForType::GPUThread) &&
             op->device_api == DeviceAPI::GLSL) {
@@ -406,10 +455,12 @@ class FlattenDimensions : public IRMutator {
         if (op->for_type == ForType::GPUBlock ||
             op->for_type == ForType::GPUThread) {
             in_gpu = true;
+            in_device_api = op->device_api;
         }
         Stmt stmt = IRMutator::visit(op);
         in_shader = old_in_shader;
         in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
         return stmt;
     }
 };
@@ -483,7 +534,12 @@ Stmt storage_flattening(Stmt s,
         }
     }
 
-    s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
+    FindBuffersInGPU finder;
+    s.accept(&finder);
+    FlattenDimensions flatten(tuple_env, outputs, target);
+    flatten.buffer_apis = finder.buffer_device_usage;
+
+    s = flatten.mutate(s);
     s = PromoteToMemoryType().mutate(s);
     return s;
 }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7c423e179d85..8651200f6c6b 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -371,6 +371,8 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
         int max_block_size[] = {0, 0, 0};
         int max_grid_size[] = {0, 0, 0};
         int max_shared_mem = 0, max_constant_mem = 0;
+        int max_texture1d = 0, max_texture2d_width = 0, max_texture2d_height = 0;
+        int texture_pitch_align = 0, max_texture2d_linear_pitch = 0;
         int cc_major = 0, cc_minor = 0;
 
         struct {
@@ -390,6 +392,11 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
             {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
             {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
             {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
+            {&max_texture1d, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH},
+            {&max_texture2d_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH},
+            {&max_texture2d_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT},
+            {&texture_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT},
+            {&max_texture2d_linear_pitch, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH},
             {nullptr, CU_DEVICE_ATTRIBUTE_MAX}};
 
         // Do all the queries.
@@ -441,7 +448,10 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
             << "      max constant memory per block: " << max_constant_mem << "\n"
             << "      compute capability " << cc_major << "." << cc_minor << "\n"
             << "      cuda cores: " << num_cores << " x " << threads_per_core
-            << " = " << num_cores * threads_per_core << "\n";
+            << " = " << num_cores * threads_per_core << "\n"
+            << "      texture pitch align: " << texture_pitch_align << "\n"
+            << "      texture max 2d pitch: " << max_texture2d_linear_pitch << "\n"
+            << "      texture max size: 1d: " << max_texture1d << " 2d: (" << max_texture2d_width << "," << max_texture2d_height << ") \n";
     }
 #endif
 
@@ -1099,6 +1109,146 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
     return 0;
 }
 
+namespace {
+// Creates a CUDA texture object viewing buf's device allocation as a 1D linear
+// or 2D pitched single-channel resource. Returns the texture handle, or 0
+// after reporting an error. `sampled` is currently unused.
+WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
+    CUresult err;
+    int texture_row_pitch_align_required = 0;
+    debug(user_context)
+        << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n";
+
+    halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device);
+
+    if (!cuTexObjectCreate) {
+        error(user_context) << "CUDA requesting texture object but don't have runtime functions (cuTexObjectCreate)";
+        return 0;
+    }
+
+    {  // Look up the row pitch alignment the current device requires for textures.
+        Context ctx(user_context);
+        if (ctx.error != 0) {
+            return 0;
+        }
+
+        CUdevice dev;
+        err = cuCtxGetDevice(&dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuCtxGetDevice failed ("
+                << Halide::Runtime::Internal::Cuda::get_error_name(err)
+                << ")";
+            return 0;
+        }
+
+        err = cuDeviceGetAttribute(&texture_row_pitch_align_required, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuDeviceGetAttribute failed ("
+                << get_error_name(err)
+                << ")";
+            return 0;
+        }
+    }
+
+    CUDA_RESOURCE_DESC resourceDesc;
+    CUDA_TEXTURE_DESC textureDesc;
+
+    memset(&resourceDesc, 0, sizeof(resourceDesc));
+    memset(&textureDesc, 0, sizeof(textureDesc));
+
+    CUarray_format format = (CUarray_format)0;  // 0 matches no CU_AD_FORMAT_* value, so it marks "unsupported type"
+    struct halide_type_t type = buf->type;
+    if (type.code == halide_type_int) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_SIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_SIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_SIGNED_INT32;
+        }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
+    } else if (type.code == halide_type_uint) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_UNSIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_UNSIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_UNSIGNED_INT32;
+        }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
+    } else if (type.code == halide_type_float) {
+        if (type.bits == 16) {
+            format = CU_AD_FORMAT_HALF;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_FLOAT;
+        }
+    }
+    if (format == 0) {
+        error(user_context) << "Unhandled datatype for CUDA texture object: " << type;
+        return 0;
+    }
+
+    debug(user_context) << " buffer dims " << buf->dimensions;
+    // Test the rank first so a 0-d buffer reaches the rank error below instead of reading dim[0].
+    if (buf->dimensions > 0 && buf->dim[0].stride != 1) {
+        error(user_context) << "CUDA requires inner stride to be 1";
+        return 0;
+    }
+
+    resourceDesc.flags = 0;
+    if (buf->dimensions == 1) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+        resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device;
+        resourceDesc.res.linear.format = format;
+        resourceDesc.res.linear.numChannels = 1;
+        resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes();
+    } else if (buf->dimensions == 2) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+        resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device;
+        resourceDesc.res.pitch2D.format = format;
+        resourceDesc.res.pitch2D.numChannels = 1;
+        resourceDesc.res.pitch2D.width = buf->dim[0].extent;
+        resourceDesc.res.pitch2D.height = buf->dim[1].extent;
+        resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride * type.bytes();
+
+        debug(user_context) << " type " << format << " width " << (int)resourceDesc.res.pitch2D.width
+                            << " height " << (int)resourceDesc.res.pitch2D.height << " pitch " << (int)resourceDesc.res.pitch2D.pitchInBytes << "\n";
+        // The row pitch must be a multiple of the alignment queried from the device above.
+        if (resourceDesc.res.pitch2D.pitchInBytes % texture_row_pitch_align_required) {
+            error(user_context) << "row stride of " << (int)resourceDesc.res.pitch2D.pitchInBytes
+                                << " must be aligned to " << texture_row_pitch_align_required << " bytes for CUDA textures";
+            return 0;
+        }
+    } else {
+        error(user_context) << "cuda texture support only handles 1d and 2d textures";
+        return 0;
+    }
+
+    CUtexObject texture = 0;
+    err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr);
+    if (err != CUDA_SUCCESS) {
+        error(user_context)
+            << "CUDA: cuTexObjectCreate failed ("
+            << Halide::Runtime::Internal::Cuda::get_error_name(err)
+            << ")";
+        return 0;
+    }
+
+    debug(user_context) << "    got texture " << texture << "\n";
+    return texture;
+}
+
+WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) {  // Destroys a texture object; `buf` is not used.
+    if (!cuTexObjectDestroy) {  // optional driver symbol did not resolve: report and bail out rather than call through null
+        error(user_context) << "attempting to free texture object but don't have runtime functions";
+        return -1;
+    }
+    return cuTexObjectDestroy(texture_object);  // CUresult of the driver call
+}
+}  // namespace
+
 WEAK int halide_cuda_run(void *user_context,
                          void *state_ptr,
                          const char *entry_name,
@@ -1163,8 +1313,21 @@ WEAK int halide_cuda_run(void *user_context,
     for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
         if (arg_is_buffer[i]) {
             halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
-            dev_handles[i] = ((halide_buffer_t *)args[i])->device;
-            translated_args[i] = &(dev_handles[i]);
+            if (arg_is_buffer[i] == 2) {
+                CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true);
+
+                if (!texture) {
+                    error(user_context) << "CUDA: halide_cuda_get_texture for arg " << (int)i << " failed";
+                    free(dev_handles);
+                    free(translated_args);
+                    return -1;
+                }
+                dev_handles[i] = texture;
+                translated_args[i] = &(dev_handles[i]);
+            } else {
+                dev_handles[i] = ((halide_buffer_t *)args[i])->device;
+                translated_args[i] = &(dev_handles[i]);
+            }
             debug(user_context) << "    halide_cuda_run translated arg" << (int)i
                                 << " [" << (*((void **)translated_args[i])) << " ...]\n";
         } else {
@@ -1192,6 +1355,14 @@ WEAK int halide_cuda_run(void *user_context,
                          stream,
                          translated_args,
                          nullptr);
+
+    for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
+        if (arg_is_buffer[i] == 2) {
+            CUtexObject texture = (CUtexObject)dev_handles[i];
+            halide_cuda_free_texture(user_context, (halide_buffer_t *)args[i], texture);
+        }
+    }
+
     free(dev_handles);
     free(translated_args);
     if (err != CUDA_SUCCESS) {
diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h
index 2f311bfd603e..9766146a9e9d 100644
--- a/src/runtime/cuda_functions.h
+++ b/src/runtime/cuda_functions.h
@@ -47,6 +47,9 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p
 
 CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream));
 
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject * pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc));
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject));
+
 #undef CUDA_FN
 #undef CUDA_FN_OPTIONAL
 #undef CUDA_FN_3020
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index cfe21d70617a..8b61a786625c 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -229,8 +229,181 @@ typedef struct CUDA_MEMCPY3D_st {
     size_t Depth;        /**< Depth of 3D memory copy */
 } CUDA_MEMCPY3D;
 
+typedef unsigned long long CUtexObject;
+
+/**
+ * Array formats. halide_cuda_get_texture selects one of these from the buffer's halide_type_t; none has the value 0.
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,  /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08,    /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09,   /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a,   /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10,           /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20           /**< 32-bit floating point */
+} CUarray_format;
+
+/**
+ * Resource types; stored in CUDA_RESOURCE_DESC::resType. The CUDA runtime here only sets LINEAR and PITCH2D.
+ */
+typedef enum CUresourcetype_enum {
+    CU_RESOURCE_TYPE_ARRAY = 0x00,           /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR = 0x02,          /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D = 0x03          /**< Pitch 2D resource */
+} CUresourcetype;
+
+/**
+ * Texture reference addressing modes. WRAP is 0, so a zero-filled CUDA_TEXTURE_DESC carries WRAP in addressMode[].
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP = 0,   /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP = 1,  /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes. POINT is 0, so a zero-filled CUDA_TEXTURE_DESC carries POINT in filterMode.
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * CUDA texture resource view formats (the type of CUDA_RESOURCE_VIEW_DESC::format)
+ */
+typedef enum CUresourceViewFormat_enum {
+    CU_RES_VIEW_FORMAT_NONE = 0x00,          /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01,      /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02,      /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03,      /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04,      /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05,      /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06,      /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07,     /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08,     /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09,     /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a,     /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b,     /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c,     /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d,     /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e,     /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f,     /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10,     /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11,     /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12,     /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13,    /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14,    /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15,    /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16,    /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17,    /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18,    /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19,  /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a,  /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b,  /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c,  /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d,    /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e,  /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f,    /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21,   /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22   /**< Block compressed 7 */
+} CUresourceViewFormat;
+
+/**
+ * Resource view descriptor (pointee of the last cuTexObjectCreate parameter; halide_cuda_get_texture passes nullptr for it)
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st {
+    CUresourceViewFormat format;   /**< Resource view format */
+    size_t width;                  /**< Width of the resource view */
+    size_t height;                 /**< Height of the resource view */
+    size_t depth;                  /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int firstLayer;       /**< First layer index */
+    unsigned int lastLayer;        /**< Last layer index */
+    unsigned int reserved[16];
+} CUDA_RESOURCE_VIEW_DESC;
+
+/**
+ * Texture descriptor (pointee of the third cuTexObjectCreate parameter). halide_cuda_get_texture zero-fills it and only sets `flags`.
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3];  /**< Address modes */
+    CUfilter_mode filterMode;       /**< Filter mode */
+    unsigned int flags;             /**< Flags */
+    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias;          /**< Mipmap level bias */
+    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
+    float borderColor[4];           /**< Border Color */
+    int reserved[12];
+} CUDA_TEXTURE_DESC;
+
+typedef struct CUDA_RESOURCE_DESC_st {  // Resource descriptor (pointee of the second cuTexObjectCreate parameter)
+    CUresourcetype resType; /**< Resource type */
+    // halide_cuda_get_texture fills `linear` for CU_RESOURCE_TYPE_LINEAR and `pitch2D` for CU_RESOURCE_TYPE_PITCH2D.
+    union {
+        struct {
+            // CUarray hArray;                   /**< CUDA array */
+        } array;  // NOTE(review): member left commented out, presumably because CUarray is not declared in this header - confirm
+        struct {
+            // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;  // NOTE(review): same as `array`; CUmipmappedArray is presumably not declared here - confirm
+        struct {
+            CUdeviceptr devPtr;       /**< Device pointer */
+            CUarray_format format;    /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t sizeInBytes;       /**< Size in bytes */
+        } linear;
+        struct {
+            CUdeviceptr devPtr;       /**< Device pointer */
+            CUarray_format format;    /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t width;             /**< Width of the array in elements */
+            size_t height;            /**< Height of the array in elements */
+            size_t pitchInBytes;      /**< Pitch between two rows in bytes */
+        } pitch2D;
+        struct {
+            int reserved[32];  // NOTE(review): presumably fixes the union's size to match the driver's struct - confirm against cuda.h
+        } reserved;
+    } res;
+
+    unsigned int flags; /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC;
+
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
 
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER 0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB 0x10
+
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp
index 62ae5feb77a2..a0862f9d64ec 100644
--- a/test/correctness/gpu_texture.cpp
+++ b/test/correctness/gpu_texture.cpp
@@ -7,24 +7,27 @@ using namespace Halide::Internal;
 
 int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
+    bool success = true;
 
-    if (!t.has_feature(halide_target_feature_opencl)) {
-        printf("[SKIP] No OpenCL target enabled.\n");
+    if (!(t.has_feature(halide_target_feature_opencl) || t.has_feature(halide_target_feature_cuda_capability30))) {
+        printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n");
         return 0;
     }
 
-    const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
-    assert(interface->compute_capability != nullptr);
-    int major, minor;
-    int err = interface->compute_capability(nullptr, &major, &minor);
-    if (err != 0 || (major == 1 && minor < 2)) {
-        printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
-        return 0;
+    if (t.has_feature(halide_target_feature_opencl)) {
+        const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
+        assert(interface->compute_capability != nullptr);
+        int major, minor;
+        int err = interface->compute_capability(nullptr, &major, &minor);
+        if (err != 0 || (major == 1 && minor < 2)) {
+            printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
+            return 0;
+        }
     }
 
     // Check dynamic allocations into Heap and Texture memory
     for (auto memory_type : {MemoryType::GPUTexture, MemoryType::Heap}) {
-        {
+        if (false) {
             // 1D stores/loads
             Buffer<int> input(100);
             input.fill(10);
@@ -49,13 +52,18 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
         {
+            int size = 17;
             // 2D stores/loads
-            Buffer<int> input(10, 10);
+
+            // to get a buffer with 32-byte row pitch
+            Buffer<int> input(24, size);
+            input.crop(0, 0, 17);
+
             input.fill(10);
             ImageParam param(Int(32), 2);
             param.set(input);
@@ -68,21 +76,24 @@ int main(int argc, char **argv) {
             f(x, y) = cast<float>(x + y);
             g(x) = param(x, x) + cast<int>(f(2 * x, x));
 
-            g.gpu_tile(x, xi, 16, TailStrategy::GuardWithIf);
+            g.gpu_tile(x, xi, 8);
 
             f.compute_root().store_in(memory_type).gpu_blocks(x, y);  // store f as integer
             g.store_in(memory_type);
+            g.bound(x, 0, size);
 
-            Buffer<int> out = g.realize(10);
-            for (int x = 0; x < 10; x++) {
+            g.compile_to_lowered_stmt("/tmp/stmt.html", {param}, Halide::HTML);
+
+            Buffer<int> out = g.realize(size);
+            for (int x = 0; x < size; x++) {
                 int correct = 3 * x + 10;
                 if (out(x) != correct) {
                     printf("out[2D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
-        {
+        if (t.has_feature(halide_target_feature_opencl)) {  // no 3d in our cuda support right now
             // 3D stores/loads
             Buffer<int> input(10, 10, 10);
             input.fill(10);
@@ -108,7 +119,7 @@ int main(int argc, char **argv) {
                 int correct = 4 * x + 10;
                 if (out(x) != correct) {
                     printf("out[3D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
@@ -141,12 +152,19 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D-shift][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
+        if (!success) {
+            break;
+        }
     }
 
-    printf("Success!\n");
-    return 0;
+    if (success) {
+        printf("Success!\n");
+        return 0;
+    }
+    printf("Failed!\n");
+    return 1;
 }