From 7940fa5c304b5051d1b38757777f805cacb904ee Mon Sep 17 00:00:00 2001 From: John Laxson Date: Sun, 1 Nov 2020 21:37:28 -0700 Subject: [PATCH 01/13] some stuff --- src/CodeGen_GPU_Host.cpp | 9 ++- src/CodeGen_LLVM.cpp | 2 +- src/CodeGen_PTX_Dev.cpp | 43 +++++++++- src/StorageFlattening.cpp | 6 +- src/runtime/cuda.cpp | 60 +++++++++++++- src/runtime/cuda_functions.h | 3 + src/runtime/mini_cuda.h | 130 +++++++++++++++++++++++++++++++ test/correctness/gpu_texture.cpp | 20 ++--- 8 files changed, 258 insertions(+), 15 deletions(-) diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp index c27ab4d0e788..3262a57e8491 100644 --- a/src/CodeGen_GPU_Host.cpp +++ b/src/CodeGen_GPU_Host.cpp @@ -441,7 +441,14 @@ void CodeGen_GPU_Host::visit(const For *loop) { i)); } - builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer), + int8_t buffer_type = 0; + if (closure_args[i].is_buffer && closure_args[i].memory_type == MemoryType::GPUTexture) { + buffer_type = 2; + } else if (closure_args[i].is_buffer) { + buffer_type = 1; + } + + builder->CreateStore(ConstantInt::get(i8_t, buffer_type), builder->CreateConstGEP2_32( gpu_arg_is_buffer_arr_type, gpu_arg_is_buffer_arr, diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index e7b4d7954fbd..5b9051e18706 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1435,7 +1435,7 @@ Value *CodeGen_LLVM::codegen(const Expr &e) { value->getType() == llvm_type_of(e.type())) << "Codegen of Expr " << e << " of type " << e.type() - << " did not produce llvm IR of the corresponding llvm type.\n"; + << " did not produce llvm IR of the corresponding llvm type. 
Llvm was " << llvm_type_of(e.type()) << "\n"; return value; } diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index ea876762b3ee..79ff9f8763ae 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -70,7 +70,11 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, vector arg_types(args.size()); for (size_t i = 0; i < args.size(); i++) { if (args[i].is_buffer) { - arg_types[i] = llvm_type_of(UInt(8))->getPointerTo(); + if (args[i].memory_type == MemoryType::GPUTexture) { + arg_types[i] = llvm_type_of(Int(64)); + } else { + arg_types[i] = llvm_type_of(UInt(8))->getPointerTo(); + } } else { arg_types[i] = llvm_type_of(args[i].type); } @@ -172,6 +176,43 @@ void CodeGen_PTX_Dev::visit(const Call *op) { internal_assert(barrier0) << "Could not find PTX barrier intrinsic (llvm.nvvm.barrier0)\n"; builder->CreateCall(barrier0); value = ConstantInt::get(i32_t, 0); + } else if (op->is_intrinsic(Call::image_load)) { + int num_args = (op->args.size() - 2) / 2; + user_assert(num_args >= 1 && num_args <= 2); + + string res_desc = ""; + user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results"; + Type res_type; + if (op->type.is_float()) { + res_desc = "f32"; + res_type = Type(Type::Float, 32, 4); + } else { + res_desc = "s32"; + res_type = Type(Type::Int, 32, 4); + } + + string coord_desc = ""; + if (op->args[2].type().is_float()) { + coord_desc = ".f32"; + } else { + coord_desc = ".s32"; + } + + string dim = std::to_string(num_args) + "d"; + string intrinsic = "llvm.nvvm.tex.unified." 
+ dim + ".v4" + res_desc + coord_desc; + + vector coords; + coords.push_back(Variable::make(Int(64), op->args[0].as()->value)); + for (size_t i = 2; i < op->args.size(); i += 2) { + internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type"; + coords.push_back(op->args[i]); + } + llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords); + // call->getCalledFunction()->setCallingConv(CallingConv::Tail); + // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords); + // call->setTailCall(true); + value = builder->CreateExtractElement(call, ConstantInt::get(i32_t, 0)); + } else { CodeGen_LLVM::visit(op); } diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index e3ad51038666..aab178b4bc05 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -42,6 +42,7 @@ class FlattenDimensions : public IRMutator { Scope<> realizations, shader_scope_realizations; bool in_shader = false; bool in_gpu = false; + DeviceAPI in_device_api = DeviceAPI::None; Expr make_shape_var(string name, const string &field, size_t dim, const Buffer<> &buf, const Parameter ¶m) { @@ -259,7 +260,7 @@ class FlattenDimensions : public IRMutator { Expr store = Call::make(value.type(), Call::image_store, args, Call::Intrinsic); return Evaluate::make(store); - } else if (in_gpu && textures.count(op->name)) { + } else if (in_gpu && textures.count(op->name) && false && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory Expr buffer_var = Variable::make(type_of(), op->name + ".buffer", output_buf); vector args(2); @@ -398,6 +399,7 @@ class FlattenDimensions : public IRMutator { Stmt visit(const For *op) override { bool old_in_shader = in_shader; bool old_in_gpu = in_gpu; + DeviceAPI old_in_device_api = in_device_api; if ((op->for_type == ForType::GPUBlock || op->for_type == ForType::GPUThread) && op->device_api == DeviceAPI::GLSL) { @@ -406,10 +408,12 @@ class 
FlattenDimensions : public IRMutator { if (op->for_type == ForType::GPUBlock || op->for_type == ForType::GPUThread) { in_gpu = true; + in_device_api = op->device_api; } Stmt stmt = IRMutator::visit(op); in_shader = old_in_shader; in_gpu = old_in_gpu; + in_device_api = old_in_device_api; return stmt; } }; diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 7c423e179d85..10f14595b86a 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1163,8 +1163,24 @@ WEAK int halide_cuda_run(void *user_context, for (size_t i = 0; i <= num_args; i++) { // Get nullptr at end. if (arg_is_buffer[i]) { halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t)); - dev_handles[i] = ((halide_buffer_t *)args[i])->device; - translated_args[i] = &(dev_handles[i]); + if (arg_is_buffer[i] == 2) { + cudaResourceDesc rdesc; + cudaTextureDesc tdesc; + cudaResourceViewDesc rviewdesc; + cudaTextureObject_t *texture = (cudaTextureObject_t *)&dev_handles[i]; + err = cudaCreateTextureObject(texture, &rdesc, &tdesc, &rviewdesc); + if (err != CUDA_SUCCESS) { + error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed: " + << get_error_name(err); + free(dev_handles); + free(translated_args); + return err; + } + translated_args[i] = (void *)*texture; + } else { + dev_handles[i] = ((halide_buffer_t *)args[i])->device; + translated_args[i] = &(dev_handles[i]); + } debug(user_context) << " halide_cuda_run translated arg" << (int)i << " [" << (*((void **)translated_args[i])) << " ...]\n"; } else { @@ -1192,6 +1208,14 @@ WEAK int halide_cuda_run(void *user_context, stream, translated_args, nullptr); + + for (size_t i = 0; i <= num_args; i++) { // Get nullptr at end. 
+ if (arg_is_buffer[i] == 2) { + cudaTextureObject_t texture = (cudaTextureObject_t)translated_args[i]; + cudaDestroyTextureObject(texture); + } + } + free(dev_handles); free(translated_args); if (err != CUDA_SUCCESS) { @@ -1311,6 +1335,38 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min return 0; } +WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) { + if (!cudaCreateTextureObject) { + debug(user_context) << "requesting texture object but don't have runtime functions"; + return -1; + } + + struct cudaResourceDesc resourceDesc; + struct cudaTextureDesc textureDesc; + struct cudaResourceViewDesc resourceViewDesc; + + cudaTextureObject_t texture; + CUresult err = cudaCreateTextureObject(&texture, &resourceDesc, &textureDesc, &resourceViewDesc); + + if (err != CUDA_SUCCESS) { + error(user_context) + << "CUDA: cudaCreateTextureObject failed (" + << Halide::Runtime::Internal::Cuda::get_error_name(err) + << ")"; + return 0; + } + + return texture; +} + +WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) { + if (!cudaDestroyTextureObject && texture_object) { + error(user_context) << "attempting to free texture object but don't have runtime functions"; + } + + return cudaDestroyTextureObject(texture_object); +} + namespace { WEAK __attribute__((destructor)) void halide_cuda_cleanup() { halide_cuda_device_release(nullptr); diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h index 2f311bfd603e..b0d32755f707 100644 --- a/src/runtime/cuda_functions.h +++ b/src/runtime/cuda_functions.h @@ -47,6 +47,9 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream)); +CUDA_FN_OPTIONAL(CUresult, cudaCreateTextureObject, (cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, 
const struct cudaResourceViewDesc *pResViewDesc)); +CUDA_FN_OPTIONAL(CUresult, cudaDestroyTextureObject, (cudaTextureObject_t texObject)); + #undef CUDA_FN #undef CUDA_FN_OPTIONAL #undef CUDA_FN_3020 diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h index cfe21d70617a..0598a146c944 100644 --- a/src/runtime/mini_cuda.h +++ b/src/runtime/mini_cuda.h @@ -229,6 +229,136 @@ typedef struct CUDA_MEMCPY3D_st { size_t Depth; /**< Depth of 3D memory copy */ } CUDA_MEMCPY3D; +typedef unsigned long long cudaTextureObject_t; + +enum cudaChannelFormatKind { + cudaChannelFormatKindSigned = 0, /**< Signed channel format */ + cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ + cudaChannelFormatKindFloat = 2, /**< Float channel format */ + cudaChannelFormatKindNone = 3 /**< No channel format */ +}; + +enum cudaResourceType { + cudaResourceTypeArray = 0x00, + cudaResourceTypeMipmappedArray = 0x01, + cudaResourceTypeLinear = 0x02, + cudaResourceTypePitch2D = 0x03 +}; + +struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; +}; + +enum cudaTextureAddressMode { + cudaAddressModeWrap = 0, + cudaAddressModeClamp = 1, + cudaAddressModeMirror = 2, + cudaAddressModeBorder = 3 +}; + +enum cudaTextureFilterMode { + cudaFilterModePoint = 0, + cudaFilterModeLinear = 1 +}; + +enum cudaTextureReadMode { + cudaReadModeElementType = 0, + cudaReadModeNormalizedFloat = 1 +}; + +/** + * CUDA texture resource view formats + */ +enum cudaResourceViewFormat +{ + cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ + cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ + cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ + cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit 
integers */ + cudaResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */ + cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ + cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ + cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ + cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ + cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ + cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ + cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ + cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ + cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ + cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ + cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ + cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ + cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ + cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ + cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ + cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ + cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ + cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ + cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 
5 unsigned */ + cudaResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */ + cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ + cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ + cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ +}; + +struct cudaResourceViewDesc { + enum cudaResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; +}; + +struct cudaTextureDesc { + enum cudaTextureAddressMode addressMode[3]; + enum cudaTextureFilterMode filterMode; + enum cudaTextureReadMode readMode; + int sRGB; + float borderColor[4]; + int normalizedCoords; + unsigned int maxAnisotropy; + enum cudaTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; +}; + +struct cudaResourceDesc { + enum cudaResourceType resType; + + union { + struct { + // cudaArray_t array; + } array; + struct { + // cudaMipmappedArray_t mipmap; + } mipmap; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t sizeInBytes; + } linear; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; +}; + #define CU_POINTER_ATTRIBUTE_CONTEXT 1 } // namespace Cuda diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp index 62ae5feb77a2..3de269d07fa2 100644 --- a/test/correctness/gpu_texture.cpp +++ b/test/correctness/gpu_texture.cpp @@ -8,18 +8,20 @@ using namespace Halide::Internal; int main(int argc, char **argv) { Target t = get_jit_target_from_environment(); - if (!t.has_feature(halide_target_feature_opencl)) { - printf("[SKIP] No OpenCL target enabled.\n"); + if (!(t.has_feature(halide_target_feature_opencl) || 
t.has_feature(halide_target_feature_cuda_capability30))) { + printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n"); return 0; } - const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL); - assert(interface->compute_capability != nullptr); - int major, minor; - int err = interface->compute_capability(nullptr, &major, &minor); - if (err != 0 || (major == 1 && minor < 2)) { - printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor); - return 0; + if (t.has_feature(halide_target_feature_opencl)) { + const auto *interface = get_device_interface_for_device_api(DeviceAPI::OpenCL); + assert(interface->compute_capability != nullptr); + int major, minor; + int err = interface->compute_capability(nullptr, &major, &minor); + if (err != 0 || (major == 1 && minor < 2)) { + printf("[SKIP] OpenCL %d.%d is less than required 1.2.\n", major, minor); + return 0; + } } // Check dynamic allocations into Heap and Texture memory From 9a2239a1eb3b325cffb35d252f08bc67e1dacddf Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 00:39:00 -0700 Subject: [PATCH 02/13] codegen struct x4 not vector --- src/CodeGen_PTX_Dev.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 79ff9f8763ae..40160fa6ce32 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -70,7 +70,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, vector arg_types(args.size()); for (size_t i = 0; i < args.size(); i++) { if (args[i].is_buffer) { - if (args[i].memory_type == MemoryType::GPUTexture) { + if (args[i].read && args[i].memory_type == MemoryType::GPUTexture) { arg_types[i] = llvm_type_of(Int(64)); } else { arg_types[i] = llvm_type_of(UInt(8))->getPointerTo(); @@ -87,7 +87,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, // Mark the buffer args as no alias for (size_t i = 0; i < args.size(); i++) { - if (args[i].is_buffer) { + if (args[i].is_buffer && 
(args[i].write || args[i].memory_type != MemoryType::GPUTexture)) { function->addParamAttr(i, Attribute::NoAlias); } } @@ -182,16 +182,19 @@ void CodeGen_PTX_Dev::visit(const Call *op) { string res_desc = ""; user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results"; - Type res_type; + llvm::Type *res_type; if (op->type.is_float()) { res_desc = "f32"; - res_type = Type(Type::Float, 32, 4); + auto element = llvm_type_of(Float(32)); + res_type = llvm::StructType::get(element, element, element, element); } else { res_desc = "s32"; - res_type = Type(Type::Int, 32, 4); + auto element = llvm_type_of(Int(32)); + res_type = llvm::StructType::get(element, element, element, element); } string coord_desc = ""; + user_assert(op->args[2].type().bits() == 32) << "ptx texture sampler only supports 32 bit args"; if (op->args[2].type().is_float()) { coord_desc = ".f32"; } else { @@ -201,17 +204,17 @@ void CodeGen_PTX_Dev::visit(const Call *op) { string dim = std::to_string(num_args) + "d"; string intrinsic = "llvm.nvvm.tex.unified." 
+ dim + ".v4" + res_desc + coord_desc; - vector coords; - coords.push_back(Variable::make(Int(64), op->args[0].as()->value)); + vector coords; + coords.push_back(codegen(Variable::make(Int(64), op->args[0].as()->value))); for (size_t i = 2; i < op->args.size(); i += 2) { internal_assert(op->args[i].type() == op->args[2].type()) << "all coordinates must be same type"; - coords.push_back(op->args[i]); + coords.push_back(codegen(op->args[i])); } - llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords); + llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords); // call->getCalledFunction()->setCallingConv(CallingConv::Tail); // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords); // call->setTailCall(true); - value = builder->CreateExtractElement(call, ConstantInt::get(i32_t, 0)); + value = builder->CreateExtractValue(call, {0}); } else { CodeGen_LLVM::visit(op); From 63288d497399bc9809ad0e781446c82fa663d016 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 09:11:21 -0700 Subject: [PATCH 03/13] it kinda works --- src/runtime/cuda.cpp | 149 ++++++++++++++++------- src/runtime/cuda_functions.h | 4 +- src/runtime/mini_cuda.h | 227 ++++++++++++++++++++--------------- 3 files changed, 234 insertions(+), 146 deletions(-) diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 10f14595b86a..1db647d899e6 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1099,6 +1099,102 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) { return 0; } +namespace { +WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) { + debug(user_context) + << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n"; + + halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device); + + if (!cuTexObjectCreate) { + error(user_context) << 
"requesting texture object but don't have runtime functions"; + return 0; + } + + CUDA_RESOURCE_DESC resourceDesc; + CUDA_TEXTURE_DESC textureDesc; + // CUDA_RESOURCE_VIEW_DESC resourceViewDesc; + + memset(&resourceDesc, 0, sizeof(resourceDesc)); + memset(&textureDesc, 0, sizeof(textureDesc)); + + // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT; + + CUarray_format format = (CUarray_format)0; + struct halide_type_t type = buf->type; + if (type.code == halide_type_int) { + if (type.bits == 8) { + format = CU_AD_FORMAT_SIGNED_INT8; + } else if (type.bits == 16) { + format = CU_AD_FORMAT_SIGNED_INT16; + } else if (type.bits == 32) { + format = CU_AD_FORMAT_SIGNED_INT32; + } + } else if (type.code == halide_type_uint) { + if (type.bits == 8) { + format = CU_AD_FORMAT_UNSIGNED_INT8; + } else if (type.bits == 16) { + format = CU_AD_FORMAT_UNSIGNED_INT16; + } else if (type.bits == 32) { + format = CU_AD_FORMAT_UNSIGNED_INT32; + } + } else if (type.code == halide_type_float) { + if (type.bits == 16) { + format = CU_AD_FORMAT_HALF; + } else if (type.bits == 32) { + format = CU_AD_FORMAT_FLOAT; + } + } + if (format == 0) { + error(user_context) << "Unhandled datatype for CUDA texture object: " << type; + return 0; + } + + resourceDesc.flags = 0; + if (buf->dimensions == 1) { + resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device; + resourceDesc.res.linear.format = format; + resourceDesc.res.linear.numChannels = 1; + resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes(); + } else if (buf->dimensions == 2) { + resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device; + resourceDesc.res.pitch2D.format = format; + resourceDesc.res.pitch2D.numChannels = 1; + resourceDesc.res.pitch2D.width = buf->dim[0].extent; + resourceDesc.res.pitch2D.height = buf->dim[1].extent; + resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride; + } else { + error(user_context) << 
"cuda texture support only handles 1d and td textures"; + return 0; + } + + CUtexObject texture = 0; + CUresult err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr); + + if (err != CUDA_SUCCESS) { + error(user_context) + << "CUDA: cuTexObjectCreate failed (" + << Halide::Runtime::Internal::Cuda::get_error_name(err) + << ")"; + return 0; + } + + debug(user_context) << " got texture " << texture << "\n"; + + return texture; +} + +WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) { + if (!cuTexObjectDestroy && texture_object) { + error(user_context) << "attempting to free texture object but don't have runtime functions"; + } + + return cuTexObjectDestroy(texture_object); +} +} // namespace + WEAK int halide_cuda_run(void *user_context, void *state_ptr, const char *entry_name, @@ -1164,19 +1260,16 @@ WEAK int halide_cuda_run(void *user_context, if (arg_is_buffer[i]) { halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t)); if (arg_is_buffer[i] == 2) { - cudaResourceDesc rdesc; - cudaTextureDesc tdesc; - cudaResourceViewDesc rviewdesc; - cudaTextureObject_t *texture = (cudaTextureObject_t *)&dev_handles[i]; - err = cudaCreateTextureObject(texture, &rdesc, &tdesc, &rviewdesc); - if (err != CUDA_SUCCESS) { - error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed: " - << get_error_name(err); + CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true); + + if (!texture) { + error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed"; free(dev_handles); free(translated_args); - return err; + return -1; } - translated_args[i] = (void *)*texture; + dev_handles[i] = texture; + translated_args[i] = &(dev_handles[i]); } else { dev_handles[i] = ((halide_buffer_t *)args[i])->device; translated_args[i] = &(dev_handles[i]); @@ -1211,8 +1304,8 @@ WEAK int halide_cuda_run(void *user_context, for (size_t i = 0; i 
<= num_args; i++) { // Get nullptr at end. if (arg_is_buffer[i] == 2) { - cudaTextureObject_t texture = (cudaTextureObject_t)translated_args[i]; - cudaDestroyTextureObject(texture); + CUtexObject texture = (CUtexObject)dev_handles[i]; + halide_cuda_free_texture(user_context, (halide_buffer_t *)args[i], texture); } } @@ -1335,38 +1428,6 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min return 0; } -WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) { - if (!cudaCreateTextureObject) { - debug(user_context) << "requesting texture object but don't have runtime functions"; - return -1; - } - - struct cudaResourceDesc resourceDesc; - struct cudaTextureDesc textureDesc; - struct cudaResourceViewDesc resourceViewDesc; - - cudaTextureObject_t texture; - CUresult err = cudaCreateTextureObject(&texture, &resourceDesc, &textureDesc, &resourceViewDesc); - - if (err != CUDA_SUCCESS) { - error(user_context) - << "CUDA: cudaCreateTextureObject failed (" - << Halide::Runtime::Internal::Cuda::get_error_name(err) - << ")"; - return 0; - } - - return texture; -} - -WEAK int halide_cuda_free_texture(void *user_context, struct halide_buffer_t *buf, uint64_t texture_object) { - if (!cudaDestroyTextureObject && texture_object) { - error(user_context) << "attempting to free texture object but don't have runtime functions"; - } - - return cudaDestroyTextureObject(texture_object); -} - namespace { WEAK __attribute__((destructor)) void halide_cuda_cleanup() { halide_cuda_device_release(nullptr); diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h index b0d32755f707..ba6f352ebb0e 100644 --- a/src/runtime/cuda_functions.h +++ b/src/runtime/cuda_functions.h @@ -47,8 +47,8 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream)); -CUDA_FN_OPTIONAL(CUresult, cudaCreateTextureObject, 
(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc)); -CUDA_FN_OPTIONAL(CUresult, cudaDestroyTextureObject, (cudaTextureObject_t texObject)); +CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc)); +CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject)); #undef CUDA_FN #undef CUDA_FN_OPTIONAL diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h index 0598a146c944..a2be5f0aa4ce 100644 --- a/src/runtime/mini_cuda.h +++ b/src/runtime/mini_cuda.h @@ -229,38 +229,50 @@ typedef struct CUDA_MEMCPY3D_st { size_t Depth; /**< Depth of 3D memory copy */ } CUDA_MEMCPY3D; -typedef unsigned long long cudaTextureObject_t; +typedef unsigned long long CUtexObject; -enum cudaChannelFormatKind { - cudaChannelFormatKindSigned = 0, /**< Signed channel format */ - cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ - cudaChannelFormatKindFloat = 2, /**< Float channel format */ - cudaChannelFormatKindNone = 3 /**< No channel format */ -}; +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ +} CUarray_format; -enum cudaResourceType { - cudaResourceTypeArray = 0x00, - cudaResourceTypeMipmappedArray = 0x01, - cudaResourceTypeLinear = 0x02, - cudaResourceTypePitch2D = 0x03 -}; 
+/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; -struct cudaChannelFormatDesc { - int x, y, z, w; - enum cudaChannelFormatKind f; -}; +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; -enum cudaTextureAddressMode { - cudaAddressModeWrap = 0, - cudaAddressModeClamp = 1, - cudaAddressModeMirror = 2, - cudaAddressModeBorder = 3 -}; +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; -enum cudaTextureFilterMode { - cudaFilterModePoint = 0, - cudaFilterModeLinear = 1 -}; enum cudaTextureReadMode { cudaReadModeElementType = 0, @@ -270,94 +282,109 @@ enum cudaTextureReadMode { /** * CUDA texture resource view formats */ -enum cudaResourceViewFormat +typedef enum CUresourceViewFormat_enum { - cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ - cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ - cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ - cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ - cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ - cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */ - cudaResViewFormatSignedChar4 = 0x06, /**< 4 
channel signed 8-bit integers */ - cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ - cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ - cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ - cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ - cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ - cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ - cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ - cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ - cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ - cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ - cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ - cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ - cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ - cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ - cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ - cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ - cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ - cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ - cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ - cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ - cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */ - cudaResViewFormatSignedBlockCompressed5 = 
0x1f, /**< Block compressed 5 signed */ - cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ - cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ - cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ -}; + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 
16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; -struct cudaResourceViewDesc { - enum cudaResourceViewFormat format; - size_t width; - size_t height; - size_t depth; - unsigned int firstMipmapLevel; - unsigned int lastMipmapLevel; - unsigned int firstLayer; - unsigned int lastLayer; -}; +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; -struct cudaTextureDesc { - enum 
cudaTextureAddressMode addressMode[3]; - enum cudaTextureFilterMode filterMode; - enum cudaTextureReadMode readMode; - int sRGB; - float borderColor[4]; - int normalizedCoords; - unsigned int maxAnisotropy; - enum cudaTextureFilterMode mipmapFilterMode; - float mipmapLevelBias; - float minMipmapLevelClamp; - float maxMipmapLevelClamp; -}; +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC; -struct cudaResourceDesc { - enum cudaResourceType resType; +typedef struct CUDA_RESOURCE_DESC_st +{ + CUresourcetype resType; /**< Resource type */ union { struct { - // cudaArray_t array; + // CUarray hArray; /**< CUDA array */ } array; struct { - // cudaMipmappedArray_t mipmap; + // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ } mipmap; struct { - void *devPtr; - struct cudaChannelFormatDesc desc; - size_t sizeInBytes; + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ } linear; struct { - void *devPtr; - struct cudaChannelFormatDesc desc; - size_t width; - size_t height; - size_t pitchInBytes; + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t 
pitchInBytes; /**< Pitch between two rows in bytes */ } pitch2D; + struct { + int reserved[32]; + } reserved; } res; -}; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC; #define CU_POINTER_ATTRIBUTE_CONTEXT 1 From c80ff231e33dd842a09bc393cda74bac4449d3c8 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 16:37:29 -0700 Subject: [PATCH 04/13] alignment v1 --- src/AlignGPUBuffers.cpp | 164 ++++++++++++++++++++++++++++++++++++++++ src/AlignGPUBuffers.h | 24 ++++++ src/CMakeLists.txt | 2 + src/CodeGen_PTX_Dev.cpp | 3 - src/Lower.cpp | 6 ++ src/runtime/cuda.cpp | 70 +++++++++++++++-- src/runtime/mini_cuda.h | 25 ++++++ 7 files changed, 285 insertions(+), 9 deletions(-) create mode 100644 src/AlignGPUBuffers.cpp create mode 100644 src/AlignGPUBuffers.h diff --git a/src/AlignGPUBuffers.cpp b/src/AlignGPUBuffers.cpp new file mode 100644 index 000000000000..f337024ffc2a --- /dev/null +++ b/src/AlignGPUBuffers.cpp @@ -0,0 +1,164 @@ +#include "InjectHostDevBufferCopies.h" + +#include "CodeGen_GPU_Dev.h" +#include "Debug.h" +#include "ExternFuncArgument.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRPrinter.h" +#include "Substitute.h" + +#include +#include + +namespace Halide { +namespace Internal { + +using std::set; +using std::string; +using std::vector; + +namespace { + +class FindTexturesInGPU : public IRVisitor { + public: + set textures; + + private: + bool in_gpu = false; + DeviceAPI in_device_api = DeviceAPI::None; + + void visit(const Call *op) override { + if (in_gpu && op->is_intrinsic(Call::image_load)) { + debug(2) << " load call to " << op->name << " " << textures.count(op->name) << "\n"; + textures.insert(op->args[0].as()->value); + } + + IRVisitor::visit(op); + } + + void visit(const For *op) override { + bool old_in_gpu = in_gpu; + DeviceAPI old_in_device_api = in_device_api; + if (op->for_type == ForType::GPUBlock || + op->for_type == ForType::GPUThread) { + in_gpu = true; + in_device_api = 
op->device_api; + } + IRVisitor::visit(op); + in_gpu = old_in_gpu; + in_device_api = old_in_device_api; + } +}; + +class FindBufferInitType : public IRVisitor { + public: + Type type; + + private: + void visit(const Call *op) override { + if (op->name == Call::buffer_init) { + internal_assert(op->args.size() == 10) << "don't understand the format of buffer_init"; + + halide_type_code_t code = (halide_type_code_t)op->args[5].as()->value; + int bits = op->args[6].as()->value; + type = Type(code, bits, 1); + } + + IRVisitor::visit(op); + } +}; + +class AdjustAllocationStride : public IRMutator { + Type buffer_type; +private: + Stmt visit(const LetStmt *op) override { + if (op->name == buffer) { + bool old_in_buffer = in_buffer; + debug(2) << " enter buffer " << op->name << "\n"; + internal_assert(!old_in_buffer) << " Already in buffer?!?"; + in_buffer = true; + + FindBufferInitType typeFinder; + op->accept(&typeFinder); + buffer_type = typeFinder.type; + + debug(2) << " found type " << buffer_type << "\n"; + + Expr new_value = mutate(op->value); + debug(2) << " new struct value " << new_value; + debug(2) << " exit buffer " << op->name << "\n"; + in_buffer = old_in_buffer; + + return LetStmt::make(op->name, new_value, op->body); + } else { + return IRMutator::visit(op); + } + } + + Expr visit(const Call *op) override { + if (in_buffer) { + debug(2) << " in buffer call " << op->name << "\n"; + + if (op->is_intrinsic(Call::make_struct)) { + internal_assert(op->args.size() % 4 == 0) << "unknown format of make_struct for buffer"; + + vector args = op->args; + if (args.size() >= 8) { + Expr row_width = args[1]; + Expr current_stride = args[6]; + + // This could be symbolically fetched from runtime I guess? 
+ int target_align_bytes = 32; + + int target_align_items = target_align_bytes / buffer_type.bytes(); + Expr target_align_expr = IntImm::make(Int(32), target_align_items); + + Expr row_tail_items = Mod::make(current_stride, target_align_expr); + Expr row_extra_items = Sub::make(target_align_expr, row_tail_items); + + Expr padded_stride = Select::make( + EQ::make(row_tail_items, IntImm::make(Int(32), 0)), + current_stride, + Add::make(current_stride, row_extra_items) + ); + args[6] = padded_stride; + + debug(2) << " old struct: " << static_cast(op) << "\n"; + Expr new_call = Call::make(op->type, op->name, args, op->call_type).as(); + debug(2) << " new struct: " << new_call << "\n"; + return new_call; + } + } + + return IRMutator::visit(op); + } else { + return IRMutator::visit(op); + } + } + + string buffer; + bool in_buffer = false; + +public: + AdjustAllocationStride(string b) + : buffer(std::move(b)) { + } +}; + +} // namespace + +Stmt align_gpu_buffers(Stmt s, const Target &t) { + + // Handle inputs and outputs + FindTexturesInGPU finder; + s.accept(&finder); + for (const string& texture : finder.textures) { + s = AdjustAllocationStride(texture + ".buffer").mutate(s); + } + + return s; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/AlignGPUBuffers.h b/src/AlignGPUBuffers.h new file mode 100644 index 000000000000..bcd72d7c6fec --- /dev/null +++ b/src/AlignGPUBuffers.h @@ -0,0 +1,24 @@ +#ifndef HALIDE_ALIGN_GPU_BUFFERS_H +#define HALIDE_ALIGN_GPU_BUFFERS_H + +/** \file + * Defines the lowering passes that deal with host and device buffer flow. + */ + +#include +#include + +#include "Expr.h" +#include "Target.h" + +namespace Halide { +namespace Internal { + +/** Inject calls to halide_device_malloc, halide_copy_to_device, and + * halide_copy_to_host as needed. 
*/ +Stmt align_gpu_buffers(Stmt s, const Target &t); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 29458c7db0d9..9e20d5a2ff2d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,6 +8,7 @@ set(HEADER_FILES AddAtomicMutex.h AddImageChecks.h AddParameterChecks.h + AlignGPUBuffers.h AlignLoads.h AllocationBoundsInference.h ApplySplit.h @@ -173,6 +174,7 @@ set(SOURCE_FILES AddAtomicMutex.cpp AddImageChecks.cpp AddParameterChecks.cpp + AlignGPUBuffers.cpp AlignLoads.cpp AllocationBoundsInference.cpp ApplySplit.cpp diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 40160fa6ce32..13ddec62c1f2 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -211,9 +211,6 @@ void CodeGen_PTX_Dev::visit(const Call *op) { coords.push_back(codegen(op->args[i])); } llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords); - // call->getCalledFunction()->setCallingConv(CallingConv::Tail); - // call = (llvm::CallInst *)call_intrin(res_type, 4, intrinsic, coords); - // call->setTailCall(true); value = builder->CreateExtractValue(call, {0}); } else { diff --git a/src/Lower.cpp b/src/Lower.cpp index 24fdbc47acf0..3bd946083504 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -9,6 +9,7 @@ #include "AddAtomicMutex.h" #include "AddImageChecks.h" #include "AddParameterChecks.h" +#include "AlignGPUBuffers.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" #include "BoundSmallAllocations.h" @@ -411,6 +412,11 @@ Module lower(const vector &output_funcs, s = lower_warp_shuffles(s); debug(2) << "Lowering after injecting warp shuffles:\n" << s << "\n\n"; + + debug(1) << "Aligning GPU Buffers...\n"; + s = align_gpu_buffers(s, t); + debug(2) << "Lowering after aligning GPU buffers:\n" + << s << "\n\n"; } debug(1) << "Simplifying...\n"; diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 1db647d899e6..0f58c164d95b 100644 --- 
a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -371,6 +371,8 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) { int max_block_size[] = {0, 0, 0}; int max_grid_size[] = {0, 0, 0}; int max_shared_mem = 0, max_constant_mem = 0; + int max_texture1d = 0, max_texture2d_width = 0, max_texture2d_height = 0; + int texture_pitch_align = 0, max_texture2d_linear_pitch = 0; int cc_major = 0, cc_minor = 0; struct { @@ -390,6 +392,11 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) { {&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY}, {&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR}, {&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR}, + {&max_texture1d, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH}, + {&max_texture2d_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH}, + {&max_texture2d_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT}, + {&texture_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT}, + {&max_texture2d_linear_pitch, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH}, {nullptr, CU_DEVICE_ATTRIBUTE_MAX}}; // Do all the queries. @@ -441,7 +448,10 @@ WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) { << " max constant memory per block: " << max_constant_mem << "\n" << " compute capability " << cc_major << "." 
<< cc_minor << "\n" << " cuda cores: " << num_cores << " x " << threads_per_core - << " = " << num_cores * threads_per_core << "\n"; + << " = " << num_cores * threads_per_core << "\n" + << " texture pitch align: " << texture_pitch_align << "\n" + << " texture max 2d pitch: " << max_texture2d_linear_pitch << "\n" + << " texture max size: 1d: " << max_texture1d << " 2d: (" << max_texture2d_width << "," << max_texture2d_height << ") \n"; } #endif @@ -1101,6 +1111,8 @@ WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) { namespace { WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t *buf, bool sampled) { + CUresult err; + int texture_row_pitch_align_required = 0; debug(user_context) << "CUDA: halide_cuda_get_texture (user_context: " << user_context << ", buffer: " << buf << ")\n"; @@ -1111,6 +1123,34 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t return 0; } + { + Context ctx(user_context); + if (ctx.error != 0) { + return 0; + } + + CUresult err; + + CUdevice dev; + err = cuCtxGetDevice(&dev); + if (err != CUDA_SUCCESS) { + error(user_context) + << "CUDA: cuCtxGetDevice failed (" + << Halide::Runtime::Internal::Cuda::get_error_name(err) + << ")"; + return 0; + } + + err = cuDeviceGetAttribute(&texture_row_pitch_align_required, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, dev); + if (err != CUDA_SUCCESS) { + error(user_context) + << "CUDA: cuDeviceGetAttribute failed (" + << get_error_name(err) + << ")"; + return 0; + } + } + CUDA_RESOURCE_DESC resourceDesc; CUDA_TEXTURE_DESC textureDesc; // CUDA_RESOURCE_VIEW_DESC resourceViewDesc; @@ -1118,7 +1158,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t memset(&resourceDesc, 0, sizeof(resourceDesc)); memset(&textureDesc, 0, sizeof(textureDesc)); - // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT; + // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT CUarray_format format = (CUarray_format)0; 
struct halide_type_t type = buf->type; @@ -1130,6 +1170,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t } else if (type.bits == 32) { format = CU_AD_FORMAT_SIGNED_INT32; } + textureDesc.flags |= CU_TRSF_READ_AS_INTEGER; } else if (type.code == halide_type_uint) { if (type.bits == 8) { format = CU_AD_FORMAT_UNSIGNED_INT8; @@ -1138,6 +1179,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t } else if (type.bits == 32) { format = CU_AD_FORMAT_UNSIGNED_INT32; } + textureDesc.flags |= CU_TRSF_READ_AS_INTEGER; } else if (type.code == halide_type_float) { if (type.bits == 16) { format = CU_AD_FORMAT_HALF; @@ -1150,13 +1192,20 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t return 0; } + debug(user_context) << " buffer dims " << buf->dimensions; + + if (buf->dim[0].stride != 1) { + error(user_context) << "CUDA requires inner stride to be 1"; + } + resourceDesc.flags = 0; if (buf->dimensions == 1) { resourceDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device; + resourceDesc.res.linear.devPtr = (CUdeviceptr)buf->device; resourceDesc.res.linear.format = format; resourceDesc.res.linear.numChannels = 1; resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes(); + } else if (buf->dimensions == 2) { resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D; resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device; @@ -1164,14 +1213,23 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t resourceDesc.res.pitch2D.numChannels = 1; resourceDesc.res.pitch2D.width = buf->dim[0].extent; resourceDesc.res.pitch2D.height = buf->dim[1].extent; - resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride; + resourceDesc.res.pitch2D.pitchInBytes = buf->dim[1].stride * type.bytes(); + + debug(user_context) << " type " << format << " width " << (int)resourceDesc.res.pitch2D.width + << " height " << 
(int)resourceDesc.res.pitch2D.height << " pitch " << (int)resourceDesc.res.pitch2D.pitchInBytes << "\n"; + + if (resourceDesc.res.pitch2D.pitchInBytes % texture_row_pitch_align_required) { + error(user_context) << "row stride of " << (int)resourceDesc.res.pitch2D.pitchInBytes + << " must be aligned to " << texture_row_pitch_align_required << " bytes for CUDA textures"; + return 0; + } } else { error(user_context) << "cuda texture support only handles 1d and td textures"; return 0; } CUtexObject texture = 0; - CUresult err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr); + err = cuTexObjectCreate(&texture, &resourceDesc, &textureDesc, nullptr); if (err != CUDA_SUCCESS) { error(user_context) @@ -1263,7 +1321,7 @@ WEAK int halide_cuda_run(void *user_context, CUtexObject texture = halide_cuda_get_texture(user_context, (halide_buffer_t *)args[i], true); if (!texture) { - error(user_context) << "CUDA: cudaCreateTextureObject for arg " << (int)i << "failed"; + error(user_context) << "CUDA: halide_cuda_get_texture for arg " << (int)i << " failed"; free(dev_handles); free(translated_args); return -1; diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h index a2be5f0aa4ce..3c78400d0e22 100644 --- a/src/runtime/mini_cuda.h +++ b/src/runtime/mini_cuda.h @@ -388,6 +388,31 @@ typedef struct CUDA_RESOURCE_DESC_st #define CU_POINTER_ATTRIBUTE_CONTEXT 1 +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. 
+ * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_SRGB 0x10 + } // namespace Cuda } // namespace Internal } // namespace Runtime From 39f548a9c373880cceef157a2032915056a19656 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 19:32:54 -0700 Subject: [PATCH 05/13] works! --- src/Lower.cpp | 20 ++++++----- src/StorageFlattening.cpp | 58 ++++++++++++++++++++++++++++++-- test/correctness/gpu_texture.cpp | 40 +++++++++++++++------- 3 files changed, 94 insertions(+), 24 deletions(-) diff --git a/src/Lower.cpp b/src/Lower.cpp index 3bd946083504..790e4e6a805e 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -272,6 +272,13 @@ Module lower(const vector &output_funcs, debug(2) << "Lowering after bounding small realizations:\n" << s << "\n\n"; + if (will_inject_host_copies) { + debug(1) << "Selecting a GPU API for GPU loops...\n"; + s = select_gpu_api(s, t); + debug(2) << "Lowering after selecting a GPU API:\n" + << s << "\n\n"; + } + debug(1) << "Performing storage flattening...\n"; s = storage_flattening(s, outputs, env, t); debug(2) << "Lowering after storage flattening:\n" @@ -297,11 +304,6 @@ Module lower(const vector &output_funcs, } if (will_inject_host_copies) { - debug(1) << "Selecting a GPU API for GPU loops...\n"; - s = select_gpu_api(s, t); - debug(2) << "Lowering after selecting a GPU API:\n" - << s << "\n\n"; - debug(1) << "Injecting host <-> dev buffer copies...\n"; s = inject_host_dev_buffer_copies(s, t); debug(2) << "Lowering after injecting host <-> dev buffer copies:\n" @@ -413,10 +415,10 @@ Module lower(const vector &output_funcs, debug(2) << "Lowering after injecting warp shuffles:\n" << s << "\n\n"; - debug(1) << "Aligning GPU Buffers...\n"; - s = align_gpu_buffers(s, t); - debug(2) << "Lowering after aligning GPU buffers:\n" - << s << "\n\n"; + // debug(1) << "Aligning GPU Buffers...\n"; + // s = align_gpu_buffers(s, t); + // debug(2) << "Lowering after aligning GPU buffers:\n" + // << s << "\n\n"; } debug(1) << "Simplifying...\n"; 
diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index aab178b4bc05..03a048e41e97 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -22,6 +22,39 @@ using std::string; using std::vector; namespace { +class FindBuffersInGPU : public IRVisitor { +public: + map> buffer_device_usage; + +private: + bool in_gpu = false; + DeviceAPI in_device_api = DeviceAPI::None; + using IRVisitor::visit; + + void visit(const Call *op) override { + debug(2) << " candidate load to " << op->name << " " << in_device_api << "\n"; + if (in_gpu && + (op->call_type == Call::Halide || op->call_type == Call::Image)) { + debug(2) << " load call to " << op->name << " " << in_device_api << "\n"; + buffer_device_usage[op->name].insert(in_device_api); + } + + IRVisitor::visit(op); + } + + void visit(const For *op) override { + bool old_in_gpu = in_gpu; + DeviceAPI old_in_device_api = in_device_api; + if (op->for_type == ForType::GPUBlock || + op->for_type == ForType::GPUThread) { + in_gpu = true; + in_device_api = op->device_api; + } + IRVisitor::visit(op); + in_gpu = old_in_gpu; + in_device_api = old_in_device_api; + } +}; class FlattenDimensions : public IRMutator { public: @@ -34,6 +67,8 @@ class FlattenDimensions : public IRMutator { } } + map> buffer_apis; + private: const map> &env; set outputs; @@ -117,7 +152,7 @@ class FlattenDimensions : public IRMutator { if (op->memory_type == MemoryType::GPUTexture) { textures.insert(op->name); - debug(2) << "found texture " << op->name << "\n"; + debug(2) << "found texture " << op->name << " in " << in_device_api << "\n"; } Stmt body = mutate(op->body); @@ -153,11 +188,23 @@ class FlattenDimensions : public IRMutator { if (args[j] == storage_dims[i].var) { storage_permutation.push_back((int)j); Expr alignment = storage_dims[i].alignment; + if (alignment.defined()) { allocation_extents[j] = ((extents[j] + alignment - 1) / alignment) * alignment; } else { allocation_extents[j] = extents[j]; } + + // Promote row 
alignment for buffers used as CUDA Textures + if (j == 0 && textures.count(op->name) && buffer_apis[op->name].count(DeviceAPI::CUDA)) { + // This could be symbolically fetched from runtime I guess? + int target_align_bytes = 32; + int target_align_items = target_align_bytes / op->types[0].bytes(); + + debug(2) << "promoting alignment for " << op->name << " to " << target_align_items << "\n"; + + allocation_extents[j] = ((allocation_extents[j] + target_align_items - 1) / target_align_items) * target_align_items; + } } } internal_assert(storage_permutation.size() == i + 1); @@ -260,7 +307,7 @@ class FlattenDimensions : public IRMutator { Expr store = Call::make(value.type(), Call::image_store, args, Call::Intrinsic); return Evaluate::make(store); - } else if (in_gpu && textures.count(op->name) && false && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory + } else if (in_gpu && textures.count(op->name) && in_device_api != DeviceAPI::CUDA) { // CUDA writes are still directly to memory Expr buffer_var = Variable::make(type_of(), op->name + ".buffer", output_buf); vector args(2); @@ -487,7 +534,12 @@ Stmt storage_flattening(Stmt s, } } - s = FlattenDimensions(tuple_env, outputs, target).mutate(s); + FindBuffersInGPU finder; + s.accept(&finder); + FlattenDimensions flatten(tuple_env, outputs, target); + flatten.buffer_apis = finder.buffer_device_usage; + + s = flatten.mutate(s); s = PromoteToMemoryType().mutate(s); return s; } diff --git a/test/correctness/gpu_texture.cpp b/test/correctness/gpu_texture.cpp index 3de269d07fa2..a0862f9d64ec 100644 --- a/test/correctness/gpu_texture.cpp +++ b/test/correctness/gpu_texture.cpp @@ -7,6 +7,7 @@ using namespace Halide::Internal; int main(int argc, char **argv) { Target t = get_jit_target_from_environment(); + bool success = true; if (!(t.has_feature(halide_target_feature_opencl) || t.has_feature(halide_target_feature_cuda_capability30))) { printf("[SKIP] No OpenCL or CUDA 3.0+ target enabled.\n"); @@ 
-26,7 +27,7 @@ int main(int argc, char **argv) { // Check dynamic allocations into Heap and Texture memory for (auto memory_type : {MemoryType::GPUTexture, MemoryType::Heap}) { - { + if (false) { // 1D stores/loads Buffer input(100); input.fill(10); @@ -51,13 +52,18 @@ int main(int argc, char **argv) { int correct = 2 * x + 10; if (out(x) != correct) { printf("out[1D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct); - return -1; + success = false; } } } { + int size = 17; // 2D stores/loads - Buffer input(10, 10); + + // to get a buffer with 32-byte row pitch + Buffer input(24, size); + input.crop(0, 0, 17); + input.fill(10); ImageParam param(Int(32), 2); param.set(input); @@ -70,21 +76,24 @@ int main(int argc, char **argv) { f(x, y) = cast(x + y); g(x) = param(x, x) + cast(f(2 * x, x)); - g.gpu_tile(x, xi, 16, TailStrategy::GuardWithIf); + g.gpu_tile(x, xi, 8); f.compute_root().store_in(memory_type).gpu_blocks(x, y); // store f as integer g.store_in(memory_type); + g.bound(x, 0, size); - Buffer out = g.realize(10); - for (int x = 0; x < 10; x++) { + g.compile_to_lowered_stmt("/tmp/stmt.html", {param}, Halide::HTML); + + Buffer out = g.realize(size); + for (int x = 0; x < size; x++) { int correct = 3 * x + 10; if (out(x) != correct) { printf("out[2D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct); - return -1; + success = false; } } } - { + if (t.has_feature(halide_target_feature_opencl)) { // no 3d in our cuda support right now // 3D stores/loads Buffer input(10, 10, 10); input.fill(10); @@ -110,7 +119,7 @@ int main(int argc, char **argv) { int correct = 4 * x + 10; if (out(x) != correct) { printf("out[3D][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct); - return -1; + success = false; } } } @@ -143,12 +152,19 @@ int main(int argc, char **argv) { int correct = 2 * x + 10; if (out(x) != correct) { printf("out[1D-shift][%d](%d) = %d instead of %d\n", (int)memory_type, x, out(x), correct); - return -1; + 
success = false; } } } + if (!success) { + break; + } } - printf("Success!\n"); - return 0; + if (success) { + printf("Success!\n"); + return 0; + } + printf("Failed!\n"); + return 1; } From e16a5aff161f182def18ee019dd90ee611d8f4d4 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 19:41:26 -0700 Subject: [PATCH 06/13] Cleanup --- src/AlignGPUBuffers.cpp | 164 ---------------------------------------- src/AlignGPUBuffers.h | 24 ------ src/CMakeLists.txt | 2 - src/Lower.cpp | 6 -- 4 files changed, 196 deletions(-) delete mode 100644 src/AlignGPUBuffers.cpp delete mode 100644 src/AlignGPUBuffers.h diff --git a/src/AlignGPUBuffers.cpp b/src/AlignGPUBuffers.cpp deleted file mode 100644 index f337024ffc2a..000000000000 --- a/src/AlignGPUBuffers.cpp +++ /dev/null @@ -1,164 +0,0 @@ -#include "InjectHostDevBufferCopies.h" - -#include "CodeGen_GPU_Dev.h" -#include "Debug.h" -#include "ExternFuncArgument.h" -#include "IRMutator.h" -#include "IROperator.h" -#include "IRPrinter.h" -#include "Substitute.h" - -#include -#include - -namespace Halide { -namespace Internal { - -using std::set; -using std::string; -using std::vector; - -namespace { - -class FindTexturesInGPU : public IRVisitor { - public: - set textures; - - private: - bool in_gpu = false; - DeviceAPI in_device_api = DeviceAPI::None; - - void visit(const Call *op) override { - if (in_gpu && op->is_intrinsic(Call::image_load)) { - debug(2) << " load call to " << op->name << " " << textures.count(op->name) << "\n"; - textures.insert(op->args[0].as()->value); - } - - IRVisitor::visit(op); - } - - void visit(const For *op) override { - bool old_in_gpu = in_gpu; - DeviceAPI old_in_device_api = in_device_api; - if (op->for_type == ForType::GPUBlock || - op->for_type == ForType::GPUThread) { - in_gpu = true; - in_device_api = op->device_api; - } - IRVisitor::visit(op); - in_gpu = old_in_gpu; - in_device_api = old_in_device_api; - } -}; - -class FindBufferInitType : public IRVisitor { - public: - Type type; - - 
private: - void visit(const Call *op) override { - if (op->name == Call::buffer_init) { - internal_assert(op->args.size() == 10) << "don't understand the format of buffer_init"; - - halide_type_code_t code = (halide_type_code_t)op->args[5].as()->value; - int bits = op->args[6].as()->value; - type = Type(code, bits, 1); - } - - IRVisitor::visit(op); - } -}; - -class AdjustAllocationStride : public IRMutator { - Type buffer_type; -private: - Stmt visit(const LetStmt *op) override { - if (op->name == buffer) { - bool old_in_buffer = in_buffer; - debug(2) << " enter buffer " << op->name << "\n"; - internal_assert(!old_in_buffer) << " Already in buffer?!?"; - in_buffer = true; - - FindBufferInitType typeFinder; - op->accept(&typeFinder); - buffer_type = typeFinder.type; - - debug(2) << " found type " << buffer_type << "\n"; - - Expr new_value = mutate(op->value); - debug(2) << " new struct value " << new_value; - debug(2) << " exit buffer " << op->name << "\n"; - in_buffer = old_in_buffer; - - return LetStmt::make(op->name, new_value, op->body); - } else { - return IRMutator::visit(op); - } - } - - Expr visit(const Call *op) override { - if (in_buffer) { - debug(2) << " in buffer call " << op->name << "\n"; - - if (op->is_intrinsic(Call::make_struct)) { - internal_assert(op->args.size() % 4 == 0) << "unknown format of make_struct for buffer"; - - vector args = op->args; - if (args.size() >= 8) { - Expr row_width = args[1]; - Expr current_stride = args[6]; - - // This could be symbolically fetched from runtime I guess? 
- int target_align_bytes = 32; - - int target_align_items = target_align_bytes / buffer_type.bytes(); - Expr target_align_expr = IntImm::make(Int(32), target_align_items); - - Expr row_tail_items = Mod::make(current_stride, target_align_expr); - Expr row_extra_items = Sub::make(target_align_expr, row_tail_items); - - Expr padded_stride = Select::make( - EQ::make(row_tail_items, IntImm::make(Int(32), 0)), - current_stride, - Add::make(current_stride, row_extra_items) - ); - args[6] = padded_stride; - - debug(2) << " old struct: " << static_cast(op) << "\n"; - Expr new_call = Call::make(op->type, op->name, args, op->call_type).as(); - debug(2) << " new struct: " << new_call << "\n"; - return new_call; - } - } - - return IRMutator::visit(op); - } else { - return IRMutator::visit(op); - } - } - - string buffer; - bool in_buffer = false; - -public: - AdjustAllocationStride(string b) - : buffer(std::move(b)) { - } -}; - -} // namespace - -Stmt align_gpu_buffers(Stmt s, const Target &t) { - - // Handle inputs and outputs - FindTexturesInGPU finder; - s.accept(&finder); - for (const string& texture : finder.textures) { - s = AdjustAllocationStride(texture + ".buffer").mutate(s); - } - - return s; -} - -} // namespace Internal -} // namespace Halide diff --git a/src/AlignGPUBuffers.h b/src/AlignGPUBuffers.h deleted file mode 100644 index bcd72d7c6fec..000000000000 --- a/src/AlignGPUBuffers.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef HALIDE_ALIGN_GPU_BUFFERS_H -#define HALIDE_ALIGN_GPU_BUFFERS_H - -/** \file - * Defines the lowering passes that deal with host and device buffer flow. - */ - -#include -#include - -#include "Expr.h" -#include "Target.h" - -namespace Halide { -namespace Internal { - -/** Inject calls to halide_device_malloc, halide_copy_to_device, and - * halide_copy_to_host as needed. 
*/ -Stmt align_gpu_buffers(Stmt s, const Target &t); - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9e20d5a2ff2d..29458c7db0d9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,7 +8,6 @@ set(HEADER_FILES AddAtomicMutex.h AddImageChecks.h AddParameterChecks.h - AlignGPUBuffers.h AlignLoads.h AllocationBoundsInference.h ApplySplit.h @@ -174,7 +173,6 @@ set(SOURCE_FILES AddAtomicMutex.cpp AddImageChecks.cpp AddParameterChecks.cpp - AlignGPUBuffers.cpp AlignLoads.cpp AllocationBoundsInference.cpp ApplySplit.cpp diff --git a/src/Lower.cpp b/src/Lower.cpp index 790e4e6a805e..773c84d1cabd 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -9,7 +9,6 @@ #include "AddAtomicMutex.h" #include "AddImageChecks.h" #include "AddParameterChecks.h" -#include "AlignGPUBuffers.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" #include "BoundSmallAllocations.h" @@ -414,11 +413,6 @@ Module lower(const vector &output_funcs, s = lower_warp_shuffles(s); debug(2) << "Lowering after injecting warp shuffles:\n" << s << "\n\n"; - - // debug(1) << "Aligning GPU Buffers...\n"; - // s = align_gpu_buffers(s, t); - // debug(2) << "Lowering after aligning GPU buffers:\n" - // << s << "\n\n"; } debug(1) << "Simplifying...\n"; From 43fd0ae101c0fcc9ec6280c0dc6685784b268f14 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 19:57:02 -0700 Subject: [PATCH 07/13] cleanup --- src/CodeGen_LLVM.cpp | 2 +- src/CodeGen_PTX_Dev.cpp | 21 ++++++++++++--------- src/runtime/cuda.cpp | 6 +----- src/runtime/mini_cuda.h | 6 ------ 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 5b9051e18706..e7b4d7954fbd 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1435,7 +1435,7 @@ Value *CodeGen_LLVM::codegen(const Expr &e) { value->getType() == llvm_type_of(e.type())) << "Codegen of Expr " << e << " of type " << e.type() - << " did 
not produce llvm IR of the corresponding llvm type. Llvm was " << llvm_type_of(e.type()) << "\n"; + << " did not produce llvm IR of the corresponding llvm type.\n"; return value; } diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 13ddec62c1f2..0b4a6543bff7 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -182,24 +182,28 @@ void CodeGen_PTX_Dev::visit(const Call *op) { string res_desc = ""; user_assert(op->type.bits() == 32) << "ptx texture sampler only supports 32 bit results"; - llvm::Type *res_type; + llvm::Type *element_type; if (op->type.is_float()) { res_desc = "f32"; - auto element = llvm_type_of(Float(32)); - res_type = llvm::StructType::get(element, element, element, element); + element_type = llvm_type_of(Float(32)); } else { res_desc = "s32"; - auto element = llvm_type_of(Int(32)); - res_type = llvm::StructType::get(element, element, element, element); + element_type = llvm_type_of(Int(32)); } + // PTX returns a 4 element struct (not a vector!) regardless of + llvm::Type *res_type = llvm::StructType::get(element, element, element, element); string coord_desc = ""; - user_assert(op->args[2].type().bits() == 32) << "ptx texture sampler only supports 32 bit args"; - if (op->args[2].type().is_float()) { + Type coord_type = op->args[2].type(); + user_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args"; + if (coord_type.is_float()) { coord_desc = ".f32"; - } else { + } else if (coord_type.is_uint()) { + coord_desc = ".u32"; + } else if (coord_type.is_int()) { coord_desc = ".s32"; } + internal_assert(coord_type != "") << "unhandled coordinate type for ptx texture sampler " << coord_type; string dim = std::to_string(num_args) + "d"; string intrinsic = "llvm.nvvm.tex.unified." 
+ dim + ".v4" + res_desc + coord_desc; @@ -212,7 +216,6 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } llvm::CallInst *call = (llvm::CallInst *)call_intrin(res_type, 1, intrinsic, coords); value = builder->CreateExtractValue(call, {0}); - } else { CodeGen_LLVM::visit(op); } diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 0f58c164d95b..8651200f6c6b 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1119,7 +1119,7 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t halide_assert(user_context, buf->device_interface == halide_cuda_device_interface() && buf->device); if (!cuTexObjectCreate) { - error(user_context) << "requesting texture object but don't have runtime functions"; + error(user_context) << "CUDA requesting texture object but don't have runtime functions (cuTexObjectCreate)"; return 0; } @@ -1153,13 +1153,10 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t CUDA_RESOURCE_DESC resourceDesc; CUDA_TEXTURE_DESC textureDesc; - // CUDA_RESOURCE_VIEW_DESC resourceViewDesc; memset(&resourceDesc, 0, sizeof(resourceDesc)); memset(&textureDesc, 0, sizeof(textureDesc)); - // textureDesc.filterMode = CU_TR_FILTER_MODE_POINT - CUarray_format format = (CUarray_format)0; struct halide_type_t type = buf->type; if (type.code == halide_type_int) { @@ -1205,7 +1202,6 @@ WEAK uint64_t halide_cuda_get_texture(void *user_context, struct halide_buffer_t resourceDesc.res.linear.format = format; resourceDesc.res.linear.numChannels = 1; resourceDesc.res.linear.sizeInBytes = buf->size_in_bytes(); - } else if (buf->dimensions == 2) { resourceDesc.resType = CU_RESOURCE_TYPE_PITCH2D; resourceDesc.res.pitch2D.devPtr = (CUdeviceptr)buf->device; diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h index 3c78400d0e22..0ebfed0c29ee 100644 --- a/src/runtime/mini_cuda.h +++ b/src/runtime/mini_cuda.h @@ -273,12 +273,6 @@ typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_LINEAR = 1 /**< 
Linear filter mode */ } CUfilter_mode; - -enum cudaTextureReadMode { - cudaReadModeElementType = 0, - cudaReadModeNormalizedFloat = 1 -}; - /** * CUDA texture resource view formats */ From c695593be2e52e43dcfc5145ab5ff83870e1cb76 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 19:57:48 -0700 Subject: [PATCH 08/13] cleanup formatting --- src/runtime/cuda_functions.h | 2 +- src/runtime/mini_cuda.h | 133 +++++++++++++++++------------------ 2 files changed, 66 insertions(+), 69 deletions(-) diff --git a/src/runtime/cuda_functions.h b/src/runtime/cuda_functions.h index ba6f352ebb0e..9766146a9e9d 100644 --- a/src/runtime/cuda_functions.h +++ b/src/runtime/cuda_functions.h @@ -47,7 +47,7 @@ CUDA_FN(CUresult, cuPointerGetAttribute, (void *result, int query, CUdeviceptr p CUDA_FN_OPTIONAL(CUresult, cuStreamSynchronize, (CUstream hStream)); -CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc)); +CUDA_FN_OPTIONAL(CUresult, cuTexObjectCreate, (CUtexObject * pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc)); CUDA_FN_OPTIONAL(CUresult, cuTexObjectDestroy, (CUtexObject texObject)); #undef CUDA_FN diff --git a/src/runtime/mini_cuda.h b/src/runtime/mini_cuda.h index 0ebfed0c29ee..8b61a786625c 100644 --- a/src/runtime/mini_cuda.h +++ b/src/runtime/mini_cuda.h @@ -235,32 +235,32 @@ typedef unsigned long long CUtexObject; * Array formats */ typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ - CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ - CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 
16-bit integers */ - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ - CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ } CUarray_format; /** * Resource types */ typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ } CUresourcetype; /** * Texture reference addressing modes */ typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ - CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ } CUaddress_mode; @@ -269,57 +269,55 @@ typedef enum CUaddress_mode_enum { * Texture reference filtering modes */ typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ - CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ } CUfilter_mode; /** * CUDA texture resource view formats */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, 
/**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - 
CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ +typedef enum CUresourceViewFormat_enum { + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 
32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ } CUresourceViewFormat; /** * Resource view descriptor */ -typedef struct CUDA_RESOURCE_VIEW_DESC_st -{ +typedef struct CUDA_RESOURCE_VIEW_DESC_st { CUresourceViewFormat format; /**< Resource view format */ size_t width; /**< Width of the resource view */ size_t height; /**< Height of the resource view */ @@ -347,9 +345,8 @@ typedef struct CUDA_TEXTURE_DESC_st { int reserved[12]; } CUDA_TEXTURE_DESC; -typedef struct CUDA_RESOURCE_DESC_st -{ - CUresourcetype resType; /**< Resource type */ +typedef struct 
CUDA_RESOURCE_DESC_st { + CUresourcetype resType; /**< Resource type */ union { struct { @@ -359,25 +356,25 @@ typedef struct CUDA_RESOURCE_DESC_st // CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ } mipmap; struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ } linear; struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ } pitch2D; struct { int reserved[32]; } reserved; } res; - unsigned int flags; /**< Flags (must be zero) */ + unsigned int flags; /**< Flags (must be zero) */ } CUDA_RESOURCE_DESC; #define CU_POINTER_ATTRIBUTE_CONTEXT 1 @@ -393,19 +390,19 @@ typedef struct CUDA_RESOURCE_DESC_st * in the range [0,1]. * Flag for ::cuTexRefSetFlags() */ -#define CU_TRSF_READ_AS_INTEGER 0x01 +#define CU_TRSF_READ_AS_INTEGER 0x01 /** * Use normalized texture coordinates in the range [0,1) instead of [0,dim). * Flag for ::cuTexRefSetFlags() */ -#define CU_TRSF_NORMALIZED_COORDINATES 0x02 +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 /** * Perform sRGB->linear conversion during texture read. 
* Flag for ::cuTexRefSetFlags() */ -#define CU_TRSF_SRGB 0x10 +#define CU_TRSF_SRGB 0x10 } // namespace Cuda } // namespace Internal From 14fa13a653c044616ddf29a101ee6b6662c67de4 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 20:00:21 -0700 Subject: [PATCH 09/13] cleanup formatting --- src/CodeGen_PTX_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 0b4a6543bff7..0d4a8d7d8dc1 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -190,7 +190,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { res_desc = "s32"; element_type = llvm_type_of(Int(32)); } - // PTX returns a 4 element struct (not a vector!) regardless of + // PTX returns a 4 element struct (not a vector!) regardless of llvm::Type *res_type = llvm::StructType::get(element, element, element, element); string coord_desc = ""; From ade795255be5b345b80ce972cc0af35b813b2093 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 20:14:24 -0700 Subject: [PATCH 10/13] pasta --- src/CodeGen_PTX_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 0d4a8d7d8dc1..391f8f29b921 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -191,7 +191,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { element_type = llvm_type_of(Int(32)); } // PTX returns a 4 element struct (not a vector!) 
regardless of - llvm::Type *res_type = llvm::StructType::get(element, element, element, element); + llvm::Type *res_type = llvm::StructType::get(element_type, element_type, element_type, element_type); string coord_desc = ""; Type coord_type = op->args[2].type(); From 46b0de7231ef0d66912ea0ebfaa0e6c43399f057 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 20:17:40 -0700 Subject: [PATCH 11/13] pasta --- src/CodeGen_PTX_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 391f8f29b921..ec758a31f44d 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } else if (coord_type.is_int()) { coord_desc = ".s32"; } - internal_assert(coord_type != "") << "unhandled coordinate type for ptx texture sampler " << coord_type; + internal_assert(coord_desc != "") << "unhandled coordinate type for ptx texture sampler " << coord_type; string dim = std::to_string(num_args) + "d"; string intrinsic = "llvm.nvvm.tex.unified." 
+ dim + ".v4" + res_desc + coord_desc; From 009c1d82259281a89ea5f85677cf98d9d24b3405 Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 20:32:50 -0700 Subject: [PATCH 12/13] tidy --- src/CodeGen_PTX_Dev.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index ec758a31f44d..1fd3523aee4a 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -195,7 +195,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { string coord_desc = ""; Type coord_type = op->args[2].type(); - user_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args"; + internal_assert(coord_type.bits() == 32) << "ptx texture sampler only supports 32 bit args"; if (coord_type.is_float()) { coord_desc = ".f32"; } else if (coord_type.is_uint()) { @@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } else if (coord_type.is_int()) { coord_desc = ".s32"; } - internal_assert(coord_desc != "") << "unhandled coordinate type for ptx texture sampler " << coord_type; + internal_assert(coord_desc.) << "unhandled coordinate type for ptx texture sampler " << coord_type; string dim = std::to_string(num_args) + "d"; string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc; From d70d5f34b3dd2dce553c68da3c3a12abfb93083c Mon Sep 17 00:00:00 2001 From: John Laxson Date: Mon, 2 Nov 2020 20:45:25 -0700 Subject: [PATCH 13/13] fix --- src/CodeGen_PTX_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 1fd3523aee4a..8b88802bcebc 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -203,7 +203,7 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } else if (coord_type.is_int()) { coord_desc = ".s32"; } - internal_assert(coord_desc.) 
<< "unhandled coordinate type for ptx texture sampler " << coord_type; + internal_assert(!coord_desc.empty()) << "unhandled coordinate type for ptx texture sampler " << coord_type; string dim = std::to_string(num_args) + "d"; string intrinsic = "llvm.nvvm.tex.unified." + dim + ".v4" + res_desc + coord_desc;