diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index c27ab4d0e788..3262a57e8491 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -441,7 +441,21 @@ void CodeGen_GPU_Host::visit(const For *loop) {
                                     i));
             }
 
-            builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
+            // Tag read by the device runtime: 0 = not a buffer, 1 = buffer passed
+            // as a raw device pointer, 2 = buffer passed as a texture object (see
+            // halide_cuda_run). CodeGen_PTX_Dev::add_kernel only declares a
+            // texture handle for texture buffers the kernel reads, so apply the
+            // same test here; otherwise a kernel that only writes the buffer
+            // would be handed a handle where it expects a pointer.
+            int8_t buffer_type = 0;
+            if (closure_args[i].is_buffer) {
+                const bool read_as_texture =
+                    closure_args[i].read &&
+                    closure_args[i].memory_type == MemoryType::GPUTexture;
+                buffer_type = read_as_texture ? 2 : 1;
+            }
+
+            builder->CreateStore(ConstantInt::get(i8_t, buffer_type),
                                  builder->CreateConstGEP2_32(
                                      gpu_arg_is_buffer_arr_type,
                                      gpu_arg_is_buffer_arr,
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index ea876762b3ee..8b88802bcebc 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -70,7 +70,13 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
     vector arg_types(args.size());
     for (size_t i = 0; i < args.size(); i++) {
         if (args[i].is_buffer) {
-            arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) {
+                // Textures the kernel reads are passed as a 64-bit texture
+                // object handle rather than a device pointer.
+                arg_types[i] = llvm_type_of(Int(64));
+            } else {
+                arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
+            }
         } else {
             arg_types[i] = llvm_type_of(args[i].type);
         }
@@ -83,7 +89,9 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
 
     // Mark the buffer args as no alias
     for (size_t i = 0; i < args.size(); i++) {
-        if (args[i].is_buffer) {
+        // Texture handles are i64 values and noalias is only valid on pointer
+        // parameters, so skip exactly the args that were given the i64 type.
+        if (args[i].is_buffer && !(args[i].read && args[i].memory_type == MemoryType::GPUTexture)) {
             function->addParamAttr(i, Attribute::NoAlias);
         }
     }
@@ -172,6 +180,53 @@ void CodeGen_PTX_Dev::visit(const Call *op) {
         internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n";
         builder->CreateCall(barrier0);
         value = ConstantInt::get(i32_t, 0);
+    } else if (op->is_intrinsic(Call::image_load)) {
+        // Lower a texture read to the matching llvm.nvvm.tex.unified.* intrinsic.
+        // args[0] names the texture handle; the coordinates are the even-indexed
+        // arguments from args[2] onwards.
+        int num_args = (op->args.size() - 2) / 2;
+        user_assert(num_args >= 1 && num_args <= 2) << "ptx texture sampler only supports 1d and 2d textures";
+
+        string res_desc = "";
+        user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results";
+        llvm::Type *element_type;
+        if (op->type.is_float()) {
+            res_desc = "f32";
+            element_type = llvm_type_of(Float(32));
+        } else {
+            res_desc = "s32";
+            element_type = llvm_type_of(Int(32));
+        }
+        // The intrinsic returns a 4 element struct (not a vector!); only
+        // element 0 is extracted below.
+        llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type);
+
+        string coord_desc = "";
+        Type coord_type = op->args[2].type();
+        internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args";
+        if (coord_type.is_float()) {
+            coord_desc = ".f32";
+        } else if (coord_type.is_int() || coord_type.is_uint()) {
+            // PTX tex only takes .s32 or .f32 coordinates; LLVM's i32 carries
+            // no signedness, so unsigned coordinates use the .s32 form too.
+            coord_desc = ".s32";
+        }
+        internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type;
+
+        string dim = std::to_string(num_args) + "d";
+        string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;
+
+        const StringImm *texture_name = op->args[0].as<StringImm>();
+        internal_assert(texture_name) << "image_load expects the texture name as its first argument";
+
+        vector<llvm::Value *> coords;
+        coords.push_back(codegen(Variable::make(Int(64), texture_name->value)));
+        for (size_t i = 2; i < op->args.size(); i += 2) {
+            internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type";
+            coords.push_back(codegen(op->args[i]));
+        }
+        llvm::Value *call = call_intrin(res_type, 1, intrinsic, coords);
+        value = builder->CreateExtractValue(call, {0});
     } else {
         CodeGen_LLVM::visit(op);
     }
diff --git a/src/Lower.cpp b/src/Lower.cpp
index 24fdbc47acf0..773c84d1cabd 100644
--- a/src/Lower.cpp
+++ b/src/Lower.cpp
@@ -271,6 +271,15 @@ Module lower(const vector &output_funcs,
     debug(2) << "Lowering after bounding small realizations:\n"
              << s << "\n\n";
 
+    // Resolve each GPU loop's device API before storage flattening, which
+    // reads it to decide how texture buffers are laid out and accessed.
+    if (will_inject_host_copies) {
+        debug(1) << "Selecting a GPU API for GPU loops...\n";
+        s = select_gpu_api(s, t);
+        debug(2) << "Lowering after selecting a GPU API:\n"
+                 << s << "\n\n";
+    }
+
     debug(1) << "Performing storage flattening...\n";
     s = storage_flattening(s, outputs, env, t);
     debug(2) << "Lowering after storage flattening:\n"
@@ -296,11 +305,6 @@ Module lower(const vector &output_funcs,
     }
 
     if (will_inject_host_copies) {
-        debug(1) << "Selecting a GPU API for GPU loops...\n";
-        s = select_gpu_api(s, t);
-        debug(2) << "Lowering after selecting a GPU API:\n"
-                 << s << "\n\n";
-
         debug(1) << "Injecting host <-> dev buffer copies...\n";
         s = inject_host_dev_buffer_copies(s, t);
         debug(2) << "Lowering after injecting host <-> dev buffer copies:\n"
diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp
index e3ad51038666..03a048e41e97 100644
--- a/src/StorageFlattening.cpp
+++ b/src/StorageFlattening.cpp
@@ -22,6 +22,39 @@ using std::string;
 using std::vector;
 
 namespace {
+class FindBuffersInGPU : public IRVisitor {
+public:
+    map<string, set<DeviceAPI>> buffer_device_usage;
+
+private:
+    bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
+    using IRVisitor::visit;
+
+    void visit(const Call *op) override {
+        // Note every Func or image read inside a GPU loop, keyed by that loop's device API.
+        if (in_gpu &&
+            (op->call_type == Call::Halide || op->call_type == Call::Image)) {
+            debug(2) << " load call to " << op->name << " " << in_device_api << "\n";
+            buffer_device_usage[op->name].insert(in_device_api);
+        }
+
+        IRVisitor::visit(op);
+    }
+
+    void visit(const For *op) override {
+        bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
+        if (op->for_type == ForType::GPUBlock ||
+            op->for_type == ForType::GPUThread) {
+            in_gpu = true;
+            in_device_api = op->device_api;
+        }
+        IRVisitor::visit(op);
+        in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
+    }
+};
 
 class FlattenDimensions : public IRMutator {
 public:
@@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator {
         }
     }
 
+    map<string, set<DeviceAPI>> buffer_apis;
+
 private:
     const map> &env;
     set outputs;
@@ -42,6 +77,7 @@ class FlattenDimensions : public IRMutator {
     Scope<> realizations, shader_scope_realizations;
     bool in_shader = false;
     bool in_gpu = false;
+    DeviceAPI in_device_api = DeviceAPI::None;
 
     Expr make_shape_var(string name, const string &field, size_t dim,
                         const Buffer<> &buf, const Parameter &param) {
@@ -116,7 +152,7 @@ class FlattenDimensions : public IRMutator {
 
         if (op->memory_type == MemoryType::GPUTexture) {
             textures.insert(op->name);
-            debug(2) << "found texture " << op->name << "\n";
+            debug(2) << "found texture " << op->name << " in " << in_device_api << "\n";
         }
 
         Stmt body = mutate(op->body);
@@ -152,11 +188,23 @@ class FlattenDimensions : public IRMutator {
                 if (args[j] == storage_dims[i].var) {
                     storage_permutation.push_back((int)j);
                     Expr alignment = storage_dims[i].alignment;
+
                     if (alignment.defined()) {
                         allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment;
                     } else {
                         allocation_extents[j] = extents[j];
                     }
+
+                    // Promote row alignment for buffers used as CUDA Textures
+                    if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) {
+                        // TODO: query the pitch alignment from the CUDA runtime instead of hard-coding it.
+                        int target_align_bytes = 32;
+                        int target_align_items = target_align_bytes / op->types[0].bytes();
+
+                        debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n";
+
+                        allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items;
+                    }
                 }
             }
             internal_assert(storage_permutation.size() == i + 1);
@@ -259,7 +307,7 @@ class FlattenDimensions : public IRMutator {
             Expr store = Call::make(value.type(), Call::image_store,
                                     args, Call::Intrinsic);
             return Evaluate::make(store);
-        } else if (in_gpu && textures.count(op->name)) {
+        } else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) {  // CUDA writes are still directly to memory
             Expr buffer_var =
                 Variable::make(type_of(), op->name + ".buffer", output_buf);
             vector args(2);
@@ -398,6 +446,7 @@ class FlattenDimensions : public IRMutator {
     Stmt visit(const For *op) override {
         bool old_in_shader = in_shader;
         bool old_in_gpu = in_gpu;
+        DeviceAPI old_in_device_api = in_device_api;
         if ((op->for_type == ForType::GPUBlock ||
              op->for_type == ForType::GPUThread) &&
             op->device_api == DeviceAPI::GLSL) {
@@ -406,10 +455,12 @@ class FlattenDimensions : public IRMutator {
         if (op->for_type == ForType::GPUBlock ||
             op->for_type == ForType::GPUThread) {
             in_gpu = true;
+            in_device_api = op->device_api;
         }
         Stmt stmt = IRMutator::visit(op);
         in_shader = old_in_shader;
         in_gpu = old_in_gpu;
+        in_device_api = old_in_device_api;
         return stmt;
     }
 };
@@ -483,7 +534,12 @@ Stmt storage_flattening(Stmt s,
         }
     }
 
-    s = FlattenDimensions(tuple_env, outputs, target).mutate(s);
+    FindBuffersInGPU finder;
+    s.accept(&finder);
+    FlattenDimensions flatten(tuple_env, outputs, target);
+    flatten.buffer_apis = finder.buffer_device_usage;
+
+    s = flatten.mutate(s);
     s = PromoteToMemoryType().mutate(s);
     return s;
 }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7c423e179d85..8651200f6c6b 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -371,6 +371,8 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
     int max_block_size[] = {0, 0, 0};
     int max_grid_size[] = {0, 0, 0};
     int max_shared_mem = 0, max_constant_mem = 0;
+    int max_texture1d = 0, max_texture2d_width = 0, max_texture2d_height = 0;
+    int texture_pitch_align = 0, max_texture2d_linear_pitch = 0;
     int cc_major = 0, cc_minor = 0;
 
     struct {
@@ -390,6 +392,11 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
         {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
         {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
         {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
+        {&max_texture1d, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH},
+        {&max_texture2d_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH},
+        {&max_texture2d_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT},
+        {&texture_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT},
+        {&max_texture2d_linear_pitch, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH},
         {nullptr, CU_DEVICE_ATTRIBUTE_MAX}};
 
     // Do all the queries.
@@ -441,7 +448,10 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
             << " max constant memory per block: " << max_constant_mem << "\n"
             << " compute capability " << cc_major << "." << cc_minor << "\n"
             << " cuda cores: " << num_cores << " x " << threads_per_core
-            << " = " << num_cores * threads_per_core << "\n";
+            << " = " << num_cores * threads_per_core << "\n"
+            << " texture pitch align: " << texture_pitch_align << "\n"
+            << " texture max 2d pitch: " << max_texture2d_linear_pitch << "\n"
+            << " texture max size: 1d: " << max_texture1d << " 2d: (" << max_texture2d_width << "," << max_texture2d_height << ") \n";
     }
 #endif
 
@@ -1099,6 +1109,152 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
     return 0;
 }
 
+namespace {
+// Wrap the device allocation behind buf in a CUDA texture object and return
+// its handle, or 0 after reporting an error. Handles 1-D and 2-D buffers of
+// 8/16/32-bit integers or 16/32-bit floats. sampled is currently unused.
+WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) {
+    CUresult err;
+    int texture_row_pitch_align_required = 0;
+    debug(user_context)
+        << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n";
+
+    halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device);
+
+    if (!cuTexObjectCreate) {
+        error(user_context) << "CUDA requesting texture object but don't have runtime functions (cuTexObjectCreate)";
+        return 0;
+    }
+
+    {
+        Context ctx(user_context);
+        if (ctx.error != 0) {
+            return 0;
+        }
+
+        CUdevice dev;
+        err = cuCtxGetDevice(&dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuCtxGetDevice failed ("
+                << Halide::Runtime::Internal::Cuda::get_error_name(err)
+                << ")";
+            return 0;
+        }
+
+        err = cuDeviceGetAttribute(&texture_row_pitch_align_required, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, dev);
+        if (err != CUDA_SUCCESS) {
+            error(user_context)
+                << "CUDA: cuDeviceGetAttribute failed ("
+                << get_error_name(err)
+                << ")";
+            return 0;
+        }
+    }
+
+    CUDA_RESOURCE_DESC resourceDesc;
+    CUDA_TEXTURE_DESC textureDesc;
+
+    memset(&resourceDesc, 0, sizeof(resourceDesc));
+    memset(&textureDesc, 0, sizeof(textureDesc));
+
+    // Map the Halide element type to a CUDA array format; 0 means unsupported.
+    CUarray_format format = (CUarray_format)0;
+    struct halide_type_t type = buf->type;
+    if (type.code == halide_type_int) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_SIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_SIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_SIGNED_INT32;
+        }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
+    } else if (type.code == halide_type_uint) {
+        if (type.bits == 8) {
+            format = CU_AD_FORMAT_UNSIGNED_INT8;
+        } else if (type.bits == 16) {
+            format = CU_AD_FORMAT_UNSIGNED_INT16;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_UNSIGNED_INT32;
+        }
+        textureDesc.flags |= CU_TRSF_READ_AS_INTEGER;
+    } else if (type.code == halide_type_float) {
+        if (type.bits == 16) {
+            format = CU_AD_FORMAT_HALF;
+        } else if (type.bits == 32) {
+            format = CU_AD_FORMAT_FLOAT;
+        }
+    }
+    if (format == 0) {
+        error(user_context) << "Unhandled datatype for CUDA texture object: " << type;
+        return 0;
+    }
+
+    debug(user_context) << " buffer dims " << buf->dimensions << "\n";
+
+    if (buf->dim[0].stride != 1) {
+        error(user_context) << "CUDA requires inner stride to be 1";
+        return 0;
+    }
+
+    resourceDesc.flags = 0;
+    if (buf->dimensions == 1) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+        resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device;
+        resourceDesc.res.linear.format = format;
+        resourceDesc.res.linear.numChannels = 1;
+        resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes();
+    } else if (buf->dimensions == 2) {
+        resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+        resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device;
+        resourceDesc.res.pitch2D.format = format;
+        resourceDesc.res.pitch2D.numChannels = 1;
+        resourceDesc.res.pitch2D.width = buf->dim[0].extent;
+        resourceDesc.res.pitch2D.height = buf->dim[1].extent;
+        resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride * type.bytes();
+
+        debug(user_context) << " type " << format << " width " << (int)resourceDesc.res.pitch2D.width
+                            << " height " << (int)resourceDesc.res.pitch2D.height << " pitch " << (int)resourceDesc.res.pitch2D.pitchInBytes << "\n";
+
+        // Guard the modulo against a zero alignment.
+        if (texture_row_pitch_align_required > 0 &&
+            resourceDesc.res.pitch2D.pitchInBytes % texture_row_pitch_align_required != 0) {
+            error(user_context) << "row stride of " << (int)resourceDesc.res.pitch2D.pitchInBytes
+                                << " must be aligned to " << texture_row_pitch_align_required << " bytes for CUDA textures";
+            return 0;
+        }
+    } else {
+        error(user_context) << "cuda texture support only handles 1d and 2d textures";
+        return 0;
+    }
+
+    CUtexObject texture = 0;
+    err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr);
+
+    if (err != CUDA_SUCCESS) {
+        error(user_context)
+            << "CUDA: cuTexObjectCreate failed ("
+            << Halide::Runtime::Internal::Cuda::get_error_name(err)
+            << ")";
+        return 0;
+    }
+
+    debug(user_context) << " got texture " << texture << "\n";
+
+    return texture;
+}
+
+// Destroy a texture object created by halide_cuda_get_texture. buf is unused.
+WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) {
+    if (!texture_object) {
+        // Nothing was created, so there is nothing to release.
+        return 0;
+    }
+    if (!cuTexObjectDestroy) {
+        // Report the missing entry point instead of calling through null.
+        error(user_context) << "attempting to free texture object but don't have runtime functions";
+        return -1;
+    }
+
+    return cuTexObjectDestroy(texture_object);
+}
+}  // namespace
+
 WEAK int halide_cuda_run(void *user_context,
                          void *state_ptr,
                          const char *entry_name,
@@ -1163,8 +1325,27 @@ WEAK int halide_cuda_run(void *user_context,
     for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
         if (arg_is_buffer[i]) {
             halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
-            dev_handles[i] = ((halide_buffer_t *)args[i])->device;
-            translated_args[i] = &(dev_handles[i]);
+            if (arg_is_buffer[i] == 2) {
+                CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true);
+
+                if (!texture) {
+                    error(user_context) << "CUDA: halide_cuda_get_texture for arg " << (int)i << " failed";
+                    // Release the texture objects already created for earlier args.
+                    for (size_t j = 0; j < i; j++) {
+                        if (arg_is_buffer[j] == 2) {
+                            halide_cuda_free_texture(user_context, (halide_buffer_t *)args[j], (CUtexObject)dev_handles[j]);
+                        }
+                    }
+                    free(dev_handles);
+                    free(translated_args);
+                    return -1;
+                }
+                dev_handles[i] = texture;
+                translated_args[i] = &(dev_handles[i]);
+            } else {
+                dev_handles[i] = ((halide_buffer_t *)args[i])->device;
+                translated_args[i] = &(dev_handles[i]);
+            }
             debug(user_context) << " halide_cuda_run translated arg" << (int)i
                                 << " [" << (*((void **)translated_args[i])) << " ...]\n";
         } else {
@@ -1192,6 +1373,17 @@ WEAK int halide_cuda_run(void *user_context,
                          stream,
                          translated_args,
                          nullptr);
+
+    // Release the per-launch texture objects created above.
+    // NOTE(review): if the launch above is asynchronous, confirm the texture
+    // objects may be destroyed before the kernel has finished.
+    for (size_t i = 0; i <= num_args; i++) {  // Get nullptr at end.
+        if (arg_is_buffer[i] == 2) {
+            CUtexObject texture = (CUtexObject)dev_handles[i];
+            halide_cuda_free_texture(user_context, (halide_buffer_t *)args[i], texture);
+        }
+    }
+
     free(dev_handles);
     free(translated_args);
     if (err != CUDA_SUCCESS) {
diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h
index 2f311bfd603e..9766146a9e9d 100644
--- a/src/runtime/cuda_functions.h
+++ b/src/runtime/cuda_functions.h
@@ -47,6 +47,9 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p
 
 CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream));
 
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject * pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc));
+CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject));
+
 #undef CUDA_FN
 #undef CUDA_FN_OPTIONAL
 #undef CUDA_FN_3020
diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h
index cfe21d70617a..8b61a786625c 100644
--- a/src/runtime/mini_cuda.h
+++ b/src/runtime/mini_cuda.h
@@ -229,8 +229,181 @@ typedef struct CUDA_MEMCPY3D_st {
     size_t Depth; /**< Depth of 3D memory copy */
 } CUDA_MEMCPY3D;
 
+typedef unsigned long long CUtexObject;
+
+/**
+ * Array formats
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
+} CUarray_format;
+
+/**
+ * Resource types
+ */
+typedef enum CUresourcetype_enum {
+    CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
+} CUresourcetype;
+
+/**
+ * Texture reference addressing modes
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * CUDA texture resource view formats
+ */
+typedef enum CUresourceViewFormat_enum {
+    CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */
+} CUresourceViewFormat;
+
+/**
+ * Resource view descriptor
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st {
+    CUresourceViewFormat format; /**< Resource view format */
+    size_t width; /**< Width of the resource view */
+    size_t height; /**< Height of the resource view */
+    size_t depth; /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel; /**< Last defined mipmap level */
+    unsigned int firstLayer; /**< First layer index */
+    unsigned int lastLayer; /**< Last layer index */
+    unsigned int reserved[16];
+} CUDA_RESOURCE_VIEW_DESC;
+
+/**
+ * Texture descriptor
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3]; /**< Address modes */
+    CUfilter_mode filterMode; /**< Filter mode */
+    unsigned int flags; /**< Flags */
+    unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias; /**< Mipmap level bias */
+    float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
+    float borderColor[4]; /**< Border Color */
+    int reserved[12];
+} CUDA_TEXTURE_DESC;
+
+typedef struct CUDA_RESOURCE_DESC_st {
+    CUresourcetype resType; /**< Resource type */
+
+    union {
+        struct {
+            // CUarray hArray; /**< CUDA array */
+        } array;
+        struct {
+            // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            CUdeviceptr devPtr; /**< Device pointer */
+            CUarray_format format; /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t sizeInBytes; /**< Size in bytes */
+        } linear;
+        struct {
+            CUdeviceptr devPtr; /**< Device pointer */
+            CUarray_format format; /**< Array format */
+            unsigned int numChannels; /**< Channels per array element */
+            size_t width; /**< Width of the array in elements */
+            size_t height; /**< Height of the array in elements */
+            size_t pitchInBytes; /**< Pitch between two rows in bytes */
+        } pitch2D;
+        struct {
+            int reserved[32];
+        } reserved;
+    } res;
+
+    unsigned int flags; /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC;
+
 #define CU_POINTER_ATTRIBUTE_CONTEXT 1
 
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER 0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB 0x10
+
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp
index 62ae5feb77a2..a0862f9d64ec 100644
--- a/test/correctness/gpu_texture.cpp
+++ b/test/correctness/gpu_texture.cpp
@@ -7,24 +7,27 @@ using namespace Halide::Internal;
 
 int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
+    bool success = true;
 
-    if (!t.has_feature(halide_target_feature_opencl)) {
-        printf("[SKIP] No OpenCL target enabled.\n");
+    if (!(t.has_feature(halide_target_feature_opencl) || t.has_feature(halide_target_feature_cuda_capability30))) {
+        printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n");
         return 0;
     }
 
-    const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
-    assert(interface->compute_capability != nullptr);
-    int major, minor;
-    int err = interface->compute_capability(nullptr, &major, &minor);
-    if (err != 0 || (major == 1 && minor < 2)) {
-        printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
-        return 0;
+    if (t.has_feature(halide_target_feature_opencl)) {
+        const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL);
+        assert(interface->compute_capability != nullptr);
+        int major, minor;
+        int err = interface->compute_capability(nullptr, &major, &minor);
+        if (err != 0 || (major == 1 && minor < 2)) {
+            printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor);
+            return 0;
+        }
     }
 
     // Check dynamic allocations into Heap and Texture memory
     for (auto memory_type : {MemoryType::GPUTexture, MemoryType::Heap}) {
-        {
+        if (false) {  // NOTE(review): this disables the 1D case on every backend; confirm that is intended before merging
             // 1D stores/loads
             Buffer input(100);
             input.fill(10);
@@ -49,13 +52,18 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
 
         {
+            int size = 17;
             // 2D stores/loads
-            Buffer input(10, 10);
+
+            // Allocate 24-wide rows so the row pitch is a multiple of 32 bytes, then crop to the columns under test
+            Buffer input(24, size);
+            input.crop(0, 0, size);
+
             input.fill(10);
             ImageParam param(Int(32), 2);
             param.set(input);
@@ -68,21 +76,22 @@ int main(int argc, char **argv) {
             f(x, y) = cast(x + y);
             g(x) = param(x, x) + cast(f(2 * x, x));
 
-            g.gpu_tile(x, xi, 16, TailStrategy::GuardWithIf);
+            g.gpu_tile(x, xi, 8);
             f.compute_root().store_in(memory_type).gpu_blocks(x, y);  // store f as integer
             g.store_in(memory_type);
+            g.bound(x, 0, size);
 
-            Buffer out = g.realize(10);
-            for (int x = 0; x < 10; x++) {
+            Buffer out = g.realize(size);
+            for (int x = 0; x < size; x++) {
                 int correct = 3 * x + 10;
                 if (out(x) != correct) {
                     printf("out[2D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
 
-        {
+        if (t.has_feature(halide_target_feature_opencl)) {  // no 3d in our cuda support right now
             // 3D stores/loads
             Buffer input(10, 10, 10);
             input.fill(10);
@@ -108,7 +117,7 @@ int main(int argc, char **argv) {
                 int correct = 4 * x + 10;
                 if (out(x) != correct) {
                     printf("out[3D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
@@ -141,12 +150,19 @@ int main(int argc, char **argv) {
                 int correct = 2 * x + 10;
                 if (out(x) != correct) {
                     printf("out[1D-shift][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct);
-                    return -1;
+                    success = false;
                 }
             }
         }
+        if (!success) {
+            break;
+        }
     }
 
-    printf("Success!\n");
-    return 0;
+    if (success) {
+        printf("Success!\n");
+        return 0;
+    }
+    printf("Failed!\n");
+    return 1;
 }