diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 2e7f05f91020..36224cb416c1 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -162,6 +162,27 @@ inline const char* CLGetErrorString(cl_int error) {
   }
 }
 
+/*!
+ * \brief Map a DLDataType to the OpenCL image channel data type used for it.
+ * \param data_type The element type to translate.
+ * \return CL_FLOAT / CL_HALF_FLOAT for 32/16-bit floats, CL_SIGNED_INT* / CL_UNSIGNED_INT*
+ *         for 8/16/32-bit integers. Any other type is reported through LOG(FATAL).
+ */
+inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
+  const DataType dtype(data_type);
+  if (dtype == DataType::Float(32)) return CL_FLOAT;
+  if (dtype == DataType::Float(16)) return CL_HALF_FLOAT;
+  if (dtype == DataType::Int(8)) return CL_SIGNED_INT8;
+  if (dtype == DataType::Int(16)) return CL_SIGNED_INT16;
+  if (dtype == DataType::Int(32)) return CL_SIGNED_INT32;
+  if (dtype == DataType::UInt(8)) return CL_UNSIGNED_INT8;
+  if (dtype == DataType::UInt(16)) return CL_UNSIGNED_INT16;
+  if (dtype == DataType::UInt(32)) return CL_UNSIGNED_INT32;
+  LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << dtype;
+  // Keeps a return statement on the path that follows the fatal log.
+  return CL_FLOAT;
+}
+
 /*!
  * \brief Protected OpenCL call
  * \param func Expression to call.
@@ -231,6 +254,8 @@ class OpenCLWorkspace : public DeviceAPI {
   void SetDevice(TVMContext ctx) final;
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final;
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope = NullOpt) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
@@ -337,6 +362,18 @@ class OpenCLModuleNode : public ModuleNode {
   std::vector<cl_kernel> kernels_;
 };
 
+/*!
+ * \brief Query the OpenCL object type of a memory handle.
+ * \param mem_ptr Opaque pointer that is reinterpreted as a cl_mem handle.
+ * \return The value reported by clGetMemObjectInfo for CL_MEM_TYPE.
+ */
+inline cl_mem_object_type GetMemObjectType(const void* mem_ptr) {
+  cl_mem mem = static_cast<cl_mem>(const_cast<void*>(mem_ptr));
+  cl_mem_object_type mem_type;
+  OPENCL_CALL(clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(mem_type), &mem_type, nullptr));
+  return mem_type;
+}
+
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index a3ec21e28f1d..5db3d2e5cb20 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -126,6 +126,99 @@ void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignm
   return mptr;
 }
 
+/*!
+ * \brief Alignment, in bytes, to request when allocating elements of the given type.
+ * \param dtype Element type whose bits and lanes give the size of one element.
+ * \return The element size in bytes, raised to kAllocAlignment when it is smaller.
+ */
+static inline size_t GetDataAlignment(const DLDataType dtype) {
+  const size_t elem_bytes = (dtype.bits / 8) * dtype.lanes;
+  return elem_bytes < kAllocAlignment ? kAllocAlignment : elem_bytes;
+}
+
+/*!
+ * \brief Collapse a tensor shape whose last extent is 4 into a pair of 2D extents.
+ * \param shape Tensor shape; with fewer than 3 axes, 1s are inserted before the last axis.
+ * \return (width, height), where width is the product of every axis before the last two
+ *         and height is the second-to-last axis.
+ */
+static std::tuple<size_t, size_t> FlatShapeTo2D(std::vector<int64_t> shape) {
+  ICHECK(shape.size() >= 1 && shape.back() == 4);
+  while (shape.size() < 3) {
+    shape.insert(shape.end() - 1, 1);
+  }
+  int64_t width = 1;
+  for (auto it = shape.begin(); it < shape.end() - 2; ++it) {
+    width *= *it;
+  }
+  int64_t height = *(shape.end() - 2);
+  return std::make_tuple(width, height);
+}
+
+/*!
+ * \brief Allocate device memory for a tensor as a flat buffer or as a 2D image.
+ * \param ctx The device context.
+ * \param ndim Number of entries in shape.
+ * \param shape Extent of each tensor axis.
+ * \param dtype Element type.
+ * \param mem_scope Undefined or "global" forwards to the size-based AllocDataSpace overload;
+ *        "global:texture-act" creates a CL_RGBA CL_MEM_OBJECT_IMAGE2D and requires
+ *        ndim > 1 with a last extent of 4. Any other value is fatal.
+ * \return The allocated memory object as an opaque pointer.
+ */
+void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape,
+                                      DLDataType dtype, Optional<String> mem_scope) {
+  if (!mem_scope.defined() || mem_scope.value() == "global") {
+    // by default, we can always redirect to the flat memory allocations
+    DLTensor temp;
+    temp.data = nullptr;
+    temp.ctx = ctx;
+    temp.ndim = ndim;
+    temp.dtype = dtype;
+    temp.shape = const_cast<int64_t*>(shape);
+    temp.strides = nullptr;
+    temp.byte_offset = 0;
+    size_t size = GetDataSize(temp);
+    size_t alignment = GetDataAlignment(temp.dtype);
+    return AllocDataSpace(ctx, size, alignment, dtype);
+  } else if (mem_scope.value() == "global:texture-act") {
+    this->Init();
+    ICHECK(this->context != nullptr) << "No OpenCL device";
+    cl_image_format image_format;
+    image_format.image_channel_data_type = DTypeToOpenCLChannelType(dtype);
+    cl_image_desc image_desc;
+
+    // shape must be (?, ..., ?, 4)
+    ICHECK_GT(ndim, 1);
+    ICHECK_EQ(shape[ndim - 1], 4);
+    // prepare descriptors
+    image_format.image_channel_order = CL_RGBA;
+    image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+    // flatten the tensor shape to the 2D image extents
+    size_t width, height;
+    std::tie(width, height) = FlatShapeTo2D(std::vector<int64_t>(shape, shape + ndim));
+    image_desc.image_width = width;
+    image_desc.image_height = height;
+    image_desc.image_depth = 1;
+    image_desc.image_array_size = 1;
+    image_desc.image_row_pitch = 0;
+    image_desc.image_slice_pitch = 0;
+    image_desc.num_mip_levels = 0;
+    image_desc.num_samples = 0;
+    image_desc.buffer = nullptr;
+
+    cl_int err_code;
+    cl_mem mptr = clCreateImage(this->context, CL_MEM_READ_WRITE, &image_format, &image_desc,
+                                nullptr, &err_code);
+    OPENCL_CHECK_ERROR(err_code);
+    return mptr;
+  } else {
+    LOG(FATAL) << "Device does not support allocate data space with "
+               << "specified memory scope: " << mem_scope.value();
+    return nullptr;
+  }
+}
+
 void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
   // We have to make sure that the memory object is not in the command queue
   // for some OpenCL platforms.
@@ -135,6 +228,21 @@ void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
   OPENCL_CALL(clReleaseMemObject(mptr));
 }
 
+/*!
+ * \brief Read the width and height of an OpenCL image into a 3-element region array.
+ * \param mem_ptr Opaque pointer that is reinterpreted as a cl_mem image handle.
+ * \param region Output array of at least 3 entries, set to {width, height, 1}.
+ */
+static inline void GetImageShape(const void* mem_ptr, size_t* region) {
+  cl_mem mem = static_cast<cl_mem>(const_cast<void*>(mem_ptr));
+  size_t width, height;
+  OPENCL_CALL(clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(width), &width, nullptr));
+  OPENCL_CALL(clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(height), &height, nullptr));
+  region[0] = width;
+  region[1] = height;
+  region[2] = 1;
+}
+
 void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to,
                                      size_t to_offset, size_t size, TVMContext ctx_from,
                                      TVMContext ctx_to, DLDataType type_hint,
@@ -142,21 +250,79 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void*
   this->Init();
   ICHECK(stream == nullptr);
   if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) {
-    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to),
-                                    static_cast<cl_mem>((void*)from),  // NOLINT(*)
-                                    static_cast<cl_mem>(to), from_offset, to_offset, size, 0,
-                                    nullptr, nullptr));
+    cl_mem_object_type from_type = GetMemObjectType(from);
+    cl_mem_object_type to_type = GetMemObjectType(to);
+    if (from_type == CL_MEM_OBJECT_BUFFER && to_type == CL_MEM_OBJECT_BUFFER) {
+      OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to),
+                                      static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                      static_cast<cl_mem>(to), from_offset, to_offset, size, 0,
+                                      nullptr, nullptr));
+    } else if (from_type == CL_MEM_OBJECT_IMAGE2D && to_type == CL_MEM_OBJECT_IMAGE2D) {
+      // The copy below always covers the whole image, so a byte offset cannot be honoured.
+      ICHECK(from_offset == 0 && to_offset == 0) << "Image copy does not support offsets";
+      size_t from_origin[3] = {0, 0, 0};
+      size_t to_origin[3] = {0, 0, 0};
+      size_t region[3];
+      GetImageShape(from, region);
+      OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(ctx_to),
+                                     static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                     static_cast<cl_mem>(to), from_origin, to_origin, region, 0,
+                                     nullptr, nullptr));
+    } else {
+      LOG(FATAL) << "OpenCL memory object type is wrong.";
+    }
   } else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) {
-    OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from),
-                                    static_cast<cl_mem>((void*)from),  // NOLINT(*)
-                                    CL_FALSE, from_offset, size, static_cast<char*>(to) + to_offset,
-                                    0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+    cl_mem_object_type from_type = GetMemObjectType(from);
+    switch (from_type) {
+      case CL_MEM_OBJECT_BUFFER:
+        OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from),
+                                        static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                        CL_FALSE, from_offset, size,
+                                        static_cast<char*>(to) + to_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+        break;
+      case CL_MEM_OBJECT_IMAGE2D: {
+        // The whole image is read from origin (0, 0, 0); only the host side can be offset.
+        ICHECK(from_offset == 0) << "Image read does not support a device offset";
+        size_t origin[3] = {0, 0, 0};
+        size_t region[3];
+        GetImageShape(from, region);
+        OPENCL_CALL(clEnqueueReadImage(this->GetQueue(ctx_from),
+                                       static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                       CL_FALSE, origin, region, 0, 0,
+                                       static_cast<char*>(to) + to_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+        break;
+      }
+      default:
+        LOG(FATAL) << "OpenCL memory object type is wrong.";
+    }
   } else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) {
-    OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(ctx_to), static_cast<cl_mem>(to), CL_FALSE,
-                                     to_offset, size, static_cast<const char*>(from) + from_offset,
-                                     0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+    cl_mem_object_type to_type = GetMemObjectType(to);
+    switch (to_type) {
+      case CL_MEM_OBJECT_BUFFER:
+        OPENCL_CALL(clEnqueueWriteBuffer(
+            this->GetQueue(ctx_to), static_cast<cl_mem>(to), CL_FALSE, to_offset, size,
+            static_cast<const char*>(from) + from_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+        break;
+      case CL_MEM_OBJECT_IMAGE2D: {
+        // The whole image is written from origin (0, 0, 0); only the host side can be offset.
+        ICHECK(to_offset == 0) << "Image write does not support a device offset";
+        size_t origin[3] = {0, 0, 0};
+        size_t region[3];
+        GetImageShape(to, region);
+        OPENCL_CALL(clEnqueueWriteImage(this->GetQueue(ctx_to),
+                                        static_cast<cl_mem>((void*)to),  // NOLINT(*)
+                                        CL_FALSE, origin, region, 0, 0,
+                                        static_cast<const char*>(from) + from_offset, 0, nullptr,
+                                        nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+        break;
+      }
+      default:
+        LOG(FATAL) << "OpenCL memory type is wrong.";
+    }
   } else {
     LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
   }
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index 8a070da89641..48d84ca36618 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
+from tvm import te, nd
 import tvm.testing
+import numpy as np
 
 target = "opencl"
 
@@ -120,6 +121,24 @@ def check_max(ctx, n, dtype):
     check_max(ctx, 1, "float64")
 
 
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_texture_memory():
+    def check_allocate_and_copy(shape):
+        # round trip: host -> texture 0 -> texture 1 -> host, then compare with the source
+        cpu_arr = nd.array(np.random.rand(*shape).astype("float32"), tvm.cpu(0))
+        opencl_arr0 = nd.empty(cpu_arr.shape, cpu_arr.dtype, tvm.opencl(0), "global:texture-act")
+        opencl_arr1 = nd.empty(cpu_arr.shape, cpu_arr.dtype, tvm.opencl(0), "global:texture-act")
+        cpu_arr.copyto(opencl_arr0)
+        opencl_arr0.copyto(opencl_arr1)
+        np.testing.assert_equal(cpu_arr.asnumpy(), opencl_arr1.asnumpy())
+
+    check_allocate_and_copy((3, 4))
+    check_allocate_and_copy((5, 6, 4))
+    check_allocate_and_copy((8, 5, 6, 4))
+
+
 if __name__ == "__main__":
     test_opencl_ternary_expression()
     test_opencl_inf_nan()
+    test_opencl_texture_memory()