diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 467e69a60827..59316a0bace0 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -559,6 +559,23 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr); TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data); +/*! + * \brief Allocate a data space on device with special memory scope. + * \note The memory could use a special multi-dimensional memory layout. + * That is why we pass shape and dtype instead of raw number of bytes. + * \param ctx The device context to perform operation. + * \param ndim The number of dimension of the tensor. + * \param shape The shape of the tensor. + * \param dtype The type of elements. + * \param mem_scope The memory scope of the tensor, + * can be nullptr, which indicate the default global DRAM + * \param out_data The allocated device pointer. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, + void** out_data); + /*! * \brief Free a data space on device. * \param ctx The device context to perform operation. @@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr); /*! * \brief Copy data from one place to another. - * \param from The source array. - * \param from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. - * \param num_bytes The size of the memory in bytes - * \param ctx_from The source context - * \param ctx_to The target context - * \param type_hint The type of elements, only neded by certain backends. - * can be useful for cross device endian converison. + * \note This API is designed to support special memory with shape dependent layout. + * We pass in DLTensor* with shape information to support these cases. + * \param from The source tensor. + * \param to The target tensor. * \param stream Optional stream object. * \return 0 when success, -1 when failure happens. */ -TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t num_bytes, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream); +TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream); /*! * \brief Check that an object is derived from another. diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index a6f5624de084..1276663a2bc3 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -90,6 +90,17 @@ class TVM_DLL DeviceAPI { */ virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) = 0; + /*! + * \brief Allocate a data space on device with memory scope support. + * \param ctx The device context to perform operation. + * \param ndim The number of dimension of allocated tensor. + * \param shape The shape of allocated tensor. + * \param dtype The type of elements. + * \param mem_scope The memory scope of allocated tensor. + * \return The allocated device pointer. + */ + virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope = NullOpt); /*! * \brief Free a data space on device. * \param ctx The device context to perform operation. @@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI { virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0; /*! * \brief copy data from one place to another + * \note This API is designed to support special memory with shape dependent layout. + * We pass in DLTensor* with shape information to support these cases. * \param from The source array. - * \param from_offset The byte offeset in the from. * \param to The target array. - * \param to_offset The byte offset in the to. - * \param num_bytes The size of the memory in bytes - * \param ctx_from The source context - * \param ctx_to The target context - * \param type_hint The type of elements, only neded by certain backends. - * can be useful for cross device endian converison. * \param stream Optional stream object. */ - virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) = 0; + virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream); /*! * \brief Create a new stream of execution. * @@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI { static bool NeedSetDeviceContext(int device_type) { return device_type != kDLCPU && device_type != kDLMicroDev; } + + protected: + /*! + * \brief copy data from one place to another + * \param from The source array. + * \param from_offset The byte offeset in the from. + * \param to The target array. + * \param to_offset The byte offset in the to. + * \param num_bytes The size of the memory in bytes + * \param ctx_from The source context + * \param ctx_to The target context + * \param type_hint The type of elements, only neded by certain backends. + * can be useful for cross device endian converison. + * \param stream Optional stream object. + */ + virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream); }; /*! \brief The device type bigger than this is RPC device */ diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 0ff171d4821f..a884b5c6838f 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_NDARRAY_H_ #include +#include #include #include #include @@ -133,10 +134,12 @@ class NDArray : public ObjectRef { * \brief Create an empty NDArray. * \param shape The shape of the new array. * \param dtype The data type of the new array. - * \param ctx The context of the Array. + * \param ctx The context of the array. + * \param mem_scope The memory scope of the array. * \return The created Array */ - TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx); + TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx, + Optional mem_scope = NullOpt); /*! * \brief Create a NDArray backed by a dlpack tensor. * diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 2f616ce879c9..75da3d4a5c17 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -23,6 +23,7 @@ from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t +from . import _ffi_api try: # pylint: disable=wrong-import-position @@ -253,42 +254,41 @@ def numpyasarray(np_data): return arr, shape -def empty(shape, dtype="float32", ctx=context(1, 0)): +def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None): """Create an empty array given shape and device Parameters ---------- shape : tuple of int - The shape of the array + The shape of the array. dtype : type or str The data type of the array. ctx : TVMContext - The context of the array + The context of the array. + + mem_scope : Optional[str] + The memory scope of the array. Returns ------- arr : tvm.nd.NDArray The array tvm supported. """ - shape = c_array(tvm_shape_index_t, shape) - ndim = ctypes.c_int(len(shape)) - handle = TVMArrayHandle() + shape_imm = [] + for s in shape: + if isinstance(s, tvm.tir.IntImm): + shape_imm.append(s.value) + else: + shape_imm.append(int(s)) + arr = np.array(shape_imm, "int64") + ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + shape_ptr = ctypes.cast(ptr, ctypes.c_void_p) + ndim = len(shape_imm) dtype = DataType(dtype) - check_call( - _LIB.TVMArrayAlloc( - shape, - ndim, - ctypes.c_int(dtype.type_code), - ctypes.c_int(dtype.bits), - ctypes.c_int(dtype.lanes), - ctx.device_type, - ctx.device_id, - ctypes.byref(handle), - ) - ) - return _make_array(handle, False, False) + arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope) + return arr def from_dlpack(dltensor): diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index b4457bf66614..7fd27cba6136 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); } +static size_t GetDataAlignment(const DLDataType dtype) { + size_t align = (dtype.bits / 8) * dtype.lanes; + if (align < kAllocAlignment) return kAllocAlignment; + return align; +} + +void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) { + if (!mem_scope.defined() || mem_scope.value() == "global") { + // by default, we can always redirect to the flat memory allocations + DLTensor temp; + temp.data = nullptr; + temp.ctx = ctx; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + size_t size = GetDataSize(temp); + size_t alignment = GetDataAlignment(temp.dtype); + return AllocDataSpace(ctx, size, alignment, dtype); + } + LOG(FATAL) << "Device does not support allocate data space with " + << "specified memory scope: " << mem_scope.value(); + return nullptr; +} + +void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + // by default, we can always redirect to the flat memory copy operation. + size_t nbytes = GetDataSize(*from); + ICHECK_EQ(nbytes, GetDataSize(*to)); + + ICHECK(IsContiguous(*from) && IsContiguous(*to)) + << "CopyDataFromTo only support contiguous array for now"; + CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx, + to->ctx, from->dtype, stream); +} + +void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) { + LOG(FATAL) << "Device does not support CopyDataFromTo."; +} + void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); } TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { @@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa API_END(); } +int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, void** out_data) { + API_BEGIN(); + Optional scope; + if (mem_scope != nullptr) { + scope = String(std::string(mem_scope)); + } + out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope); + API_END(); +} + int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) { API_BEGIN(); DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr); API_END(); } -int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { API_BEGIN(); + TVMContext ctx_from = from->ctx; + TVMContext ctx_to = to->ctx; TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to; - DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, num_bytes, ctx_from, - ctx_to, type_hint, stream); + DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream); API_END(); } diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 146bfa804785..b745be33b456 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI { #endif } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { - memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); - } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {} void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; @@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI { static auto* inst = new CPUDeviceAPI(); return inst; } + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final { + memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); + } }; struct CPUWorkspacePool : public WorkspacePool { diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index bc47f995eac0..c2eb1ff903e3 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa if (alignment != 1) { nbytes = (nbytes + alignment - 1) / alignment * alignment; } - return TVMPlatformMemoryAllocate(nbytes, ctx, out_data); } +int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, void** out_data) { + size_t nbytes = 1; + for (int i = 0; i < ndim; ++i) { + nbytes *= shape[i]; + } + nbytes *= (dtype.bits * dtype.lanes + 7) / 8; + + int kAllocAlignment = 128; + size_t align = (dtype.bits / 8) * dtype.lanes; + if (align < kAllocAlignment) align = kAllocAlignment; + return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data); +} + int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); } -int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { - memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes); +static bool IsContiguous(const DLTensor* arr) { + if (arr->strides == NULL) return true; + int64_t expected_stride = 1; + for (int32_t i = arr->ndim; i != 0; --i) { + int32_t k = i - 1; + if (arr->strides[k] != expected_stride) return false; + expected_stride *= arr->shape[k]; + } + return true; +} + +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + assert(IsContiguous(from) && IsContiguous(to)); + size_t size = 1; + for (int i = 0; i < from->ndim; ++i) { + size *= from->shape[i]; + } + size *= (from->dtype.bits * from->dtype.lanes + 7) / 8; + memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size); return 0; } diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 30abfc8dc559..c77395422e87 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI { } } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI { } } + public: TVMStreamHandle CreateStream(TVMContext ctx) { CUDA_CALL(cudaSetDevice(ctx.device_id)); cudaStream_t retval; diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 605c55eb89b9..70cebf5afa44 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -35,9 +35,6 @@ class HexagonDeviceAPI : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final; void FreeWorkspace(TVMContext ctx, void* ptr) final; @@ -48,6 +45,11 @@ class HexagonDeviceAPI : public DeviceAPI { static HexagonDeviceAPI* inst = new HexagonDeviceAPI(); return inst; } + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) final; }; // HexagonDeviceAPI. diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index d13ac7e78982..bd07dbfde9d0 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -84,14 +84,16 @@ class MetalWorkspace final : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; // get the global workspace static MetalWorkspace* Global(); + + protected: + void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; /*! \brief Thread local workspace */ diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h index d28e0c396e36..d5c61eccfd6d 100644 --- a/src/runtime/minrpc/minrpc_server.h +++ b/src/runtime/minrpc/minrpc_server.h @@ -169,28 +169,39 @@ class MinRPCServer { } void HandleCopyFromRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + this->Read(&(arr->ctx)); + this->Read(&(arr->ndim)); + this->Read(&(arr->dtype)); + arr->shape = this->ArenaAlloc(arr->ndim); + this->ReadArray(arr->shape, arr->ndim); + arr->strides = nullptr; + this->Read(&(arr->byte_offset)); + + uint64_t num_bytes; this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); uint8_t* data_ptr; int call_ecode = 0; - if (ctx.device_type == kDLCPU) { - data_ptr = reinterpret_cast(handle) + offset; + if (arr->ctx.device_type == kDLCPU) { + data_ptr = reinterpret_cast(data_handle) + arr->byte_offset; } else { data_ptr = this->ArenaAlloc(num_bytes); - call_ecode = - TVMDeviceCopyDataFromTo(reinterpret_cast(handle), offset, data_ptr, 0, num_bytes, - ctx, DLContext{kDLCPU, 0}, type_hint, nullptr); + DLTensor temp; + temp.data = reinterpret_cast(data_ptr); + temp.ctx = arr->ctx; + temp.ndim = arr->ndim; + temp.dtype = arr->dtype; + temp.shape = arr->shape; + temp.strides = nullptr; + temp.byte_offset = 0; + call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); // need sync to make sure that the copy is completed. if (call_ecode == 0) { - call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); } } @@ -209,30 +220,39 @@ class MinRPCServer { } void HandleCopyToRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + this->Read(&(arr->ctx)); + this->Read(&(arr->ndim)); + this->Read(&(arr->dtype)); + arr->shape = this->ArenaAlloc(arr->ndim); + this->ReadArray(arr->shape, arr->ndim); + arr->strides = nullptr; + this->Read(&(arr->byte_offset)); + uint64_t num_bytes; this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - int call_ecode = 0; - if (ctx.device_type == kDLCPU) { - uint8_t* dptr = reinterpret_cast(handle) + offset; + int call_ecode = 0; + if (arr->ctx.device_type == kDLCPU) { + uint8_t* dptr = reinterpret_cast(data_handle) + arr->byte_offset; this->ReadArray(dptr, num_bytes); } else { uint8_t* temp_data = this->ArenaAlloc(num_bytes); this->ReadArray(temp_data, num_bytes); - - call_ecode = - TVMDeviceCopyDataFromTo(temp_data, 0, reinterpret_cast(handle), offset, num_bytes, - DLContext{kDLCPU, 0}, ctx, type_hint, nullptr); + DLTensor temp; + temp.data = temp_data; + temp.ctx = DLContext{kDLCPU, 0}; + temp.ndim = arr->ndim; + temp.dtype = arr->dtype; + temp.shape = arr->shape; + temp.strides = nullptr; + temp.byte_offset = 0; + call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr); // need sync to make sure that the copy is completed. if (call_ecode == 0) { - call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); } } @@ -269,6 +289,10 @@ class MinRPCServer { this->SyscallDevAllocData(values, tcodes, num_args); break; } + case RPCCode::kDevAllocDataWithScope: { + this->SyscallDevAllocDataWithScope(values, tcodes, num_args); + break; + } case RPCCode::kDevFreeData: { this->SyscallDevFreeData(values, tcodes, num_args); break; @@ -342,34 +366,20 @@ class MinRPCServer { } void SyscallCopyAmongRemote(TVMValue* values, int* tcodes, int num_args) { - MINRPC_CHECK(num_args == 9); - // from, from_offset - MINRPC_CHECK(tcodes[0] == kTVMOpaqueHandle); - MINRPC_CHECK(tcodes[1] == kDLInt); - // to, to_offset + MINRPC_CHECK(num_args == 3); + // from dltensor + MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle); + // to dltensor + MINRPC_CHECK(tcodes[1] == kTVMDLTensorHandle); + // stream MINRPC_CHECK(tcodes[2] == kTVMOpaqueHandle); - MINRPC_CHECK(tcodes[3] == kDLInt); - // size - MINRPC_CHECK(tcodes[4] == kDLInt); - // ctx_from, ctx_to - MINRPC_CHECK(tcodes[5] == kTVMContext); - MINRPC_CHECK(tcodes[6] == kTVMContext); - // type_hint, stream - MINRPC_CHECK(tcodes[7] == kTVMDataType); - MINRPC_CHECK(tcodes[8] == kTVMOpaqueHandle); void* from = values[0].v_handle; - int64_t from_offset = values[1].v_int64; - void* to = values[2].v_handle; - int64_t to_offset = values[3].v_int64; - int64_t size = values[4].v_int64; - TVMContext ctx_from = values[5].v_ctx; - TVMContext ctx_to = values[6].v_ctx; - DLDataType type_hint = values[7].v_type; - TVMStreamHandle stream = values[8].v_handle; - - int call_ecode = TVMDeviceCopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, - ctx_to, type_hint, stream); + void* to = values[1].v_handle; + TVMStreamHandle stream = values[2].v_handle; + + int call_ecode = TVMDeviceCopyDataFromTo(reinterpret_cast(from), + reinterpret_cast(to), stream); if (call_ecode == 0) { this->ReturnVoid(); @@ -400,6 +410,23 @@ class MinRPCServer { } } + void SyscallDevAllocDataWithScope(TVMValue* values, int* tcodes, int num_args) { + MINRPC_CHECK(num_args == 2); + MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle); + MINRPC_CHECK(tcodes[1] == kTVMNullptr || tcodes[1] == kTVMStr); + + DLTensor* arr = reinterpret_cast(values[0].v_handle); + const char* mem_scope = (tcodes[1] == kTVMNullptr ? nullptr : values[1].v_str); + void* handle; + int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype, + mem_scope, &handle); + if (call_ecode == 0) { + this->ReturnHandle(handle); + } else { + this->ReturnLastTVMError(); + } + } + void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); MINRPC_CHECK(tcodes[0] == kTVMContext); diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h index e195b9ca9e89..07d13a7ff67b 100644 --- a/src/runtime/minrpc/rpc_reference.h +++ b/src/runtime/minrpc/rpc_reference.h @@ -28,7 +28,7 @@ namespace tvm { namespace runtime { /*! \brief The current RPC procotol version. */ -constexpr const char* kRPCProtocolVer = "0.7.0"; +constexpr const char* kRPCProtocolVer = "0.8.0"; /*! \brief The RPC code */ enum class RPCCode : int { @@ -51,6 +51,7 @@ enum class RPCCode : int { kDevFreeData, kDevStreamSync, kCopyAmongRemote, + kDevAllocDataWithScope, }; /*! @@ -107,6 +108,8 @@ inline const char* RPCCodeToString(RPCCode code) { return "kDevStreamSync"; case RPCCode::kCopyAmongRemote: return "kCopyAmongRemote"; + case RPCCode::kDevAllocDataWithScope: + return "kDevAllocDataWithScope"; default: return ""; } @@ -218,6 +221,44 @@ struct RPCReference { return getter.num_bytes(); } + template + static void SendDLTensor(TChannelPtr channel, DLTensor* arr) { + TVMContext ctx; + uint64_t data; + // When we return NDArray, we directly return + // the space and the context + // The client will be further wrapping + ctx = arr->ctx; + data = reinterpret_cast(arr->data); + channel->Write(data); + channel->Write(ctx); + channel->Write(arr->ndim); + channel->Write(arr->dtype); + channel->WriteArray(arr->shape, arr->ndim); + if (arr->strides != nullptr) { + channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride); + } + channel->Write(arr->byte_offset); + return; + } + + template + static DLTensor* ReceiveDLTensor(TChannelPtr channel) { + uint64_t handle; + channel->Read(&handle); + DLTensor* arr = channel->template ArenaAlloc(1); + DLTensor& tensor = *arr; + tensor.data = reinterpret_cast(handle); + channel->Read(&(tensor.ctx)); + channel->Read(&(tensor.ndim)); + channel->Read(&(tensor.dtype)); + tensor.shape = channel->template ArenaAlloc(tensor.ndim); + channel->ReadArray(tensor.shape, tensor.ndim); + tensor.strides = nullptr; + channel->Read(&(tensor.byte_offset)); + return arr; + } + /*! * \brief Send packed argument sequnce to the other peer. * @@ -292,24 +333,7 @@ struct RPCReference { } case kTVMDLTensorHandle: { DLTensor* arr = static_cast(value.v_handle); - TVMContext ctx; - uint64_t data; - // When we return NDArray, we directly return - // the space and the context - // The client will be further wrapping - ctx = arr->ctx; - data = reinterpret_cast(arr->data); - channel->Write(data); - channel->Write(ctx); - channel->Write(arr->ndim); - channel->Write(arr->dtype); - channel->WriteArray(arr->shape, arr->ndim); - if (arr->strides != nullptr) { - channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride); - } - if (arr->byte_offset != 0) { - channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldByteOffset); - } + SendDLTensor(channel, arr); break; } case kTVMNullptr: @@ -422,19 +446,7 @@ struct RPCReference { break; } case kTVMDLTensorHandle: { - uint64_t handle; - channel->Read(&handle); - DLTensor* arr = channel->template ArenaAlloc(1); - DLTensor& tensor = *arr; - tensor.data = reinterpret_cast(handle); - channel->Read(&(tensor.ctx)); - channel->Read(&(tensor.ndim)); - channel->Read(&(tensor.dtype)); - tensor.shape = channel->template ArenaAlloc(tensor.ndim); - channel->ReadArray(tensor.shape, tensor.ndim); - tensor.strides = nullptr; - tensor.byte_offset = 0; - value.v_handle = arr; + value.v_handle = ReceiveDLTensor(channel); break; } default: { diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index dae775606a7e..d3ddbf8c0229 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "runtime_base.h" @@ -58,36 +59,39 @@ inline void VerifyDataType(DLDataType dtype) { ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } -inline size_t GetDataAlignment(const DLTensor& arr) { - size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes; - if (align < kAllocAlignment) return kAllocAlignment; - return align; -} - void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch"; ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now"; - DeviceAPI::Get(handle->ctx) - ->CopyDataFromTo(data, 0, handle->data, static_cast(handle->byte_offset), nbytes, - cpu_ctx, handle->ctx, handle->dtype, nullptr); + + DLTensor from; + from.data = const_cast(data); + from.ctx = DLContext{kDLCPU, 0}; + from.ndim = handle->ndim; + from.dtype = handle->dtype; + from.shape = handle->shape; + from.strides = nullptr; + from.byte_offset = 0; + DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr); // Synchronize in case data become unavailable later. DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); } void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch"; ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now"; - DeviceAPI::Get(handle->ctx) - ->CopyDataFromTo(handle->data, static_cast(handle->byte_offset), data, 0, nbytes, - handle->ctx, cpu_ctx, handle->dtype, nullptr); + + DLTensor to; + to.data = const_cast(data); + to.ctx = DLContext{kDLCPU, 0}; + to.ndim = handle->ndim; + to.dtype = handle->dtype; + to.shape = handle->shape; + to.strides = nullptr; + to.byte_offset = 0; + + DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast(handle), &to, nullptr); // Synchronize in case data become unavailable later. DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); } @@ -186,13 +190,11 @@ NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); } -NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx) { +NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx, + Optional mem_scope) { NDArray ret = Internal::Create(shape, dtype, ctx); - // setup memory content - size_t size = GetDataSize(ret.get_mutable()->dl_tensor); - size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor); - ret.get_mutable()->dl_tensor.data = - DeviceAPI::Get(ret->ctx)->AllocDataSpace(ret->ctx, size, alignment, ret->dtype); + ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( + ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope); return ret; } @@ -236,9 +238,7 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str // api manager. TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx; - DeviceAPI::Get(ctx)->CopyDataFromTo(from->data, static_cast(from->byte_offset), to->data, - static_cast(to->byte_offset), from_size, from->ctx, - to->ctx, from->dtype, stream); + DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast(from), to, stream); } std::vector NDArray::Shape() const { return get_mutable()->shape_; } @@ -279,6 +279,17 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ API_END(); } +TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, TVMRetValue* ret) { + int64_t* shape_ptr = static_cast(static_cast(args[0])); + int ndim = args[1]; + std::vector shape(shape_ptr, shape_ptr + ndim); + DataType dtype = args[2]; + TVMContext ctx = args[3]; + Optional mem_scope = args[4]; + auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope); + *ret = ndarray; +}); + int TVMArrayFree(TVMArrayHandle handle) { API_BEGIN(); NDArray::Internal::FFIDecRef(handle); diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index fa118ed9525b..2e7f05f91020 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -232,9 +232,6 @@ class OpenCLWorkspace : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; @@ -246,6 +243,11 @@ class OpenCLWorkspace : public DeviceAPI { // get the global workspace static OpenCLWorkspace* Global(); + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; /*! \brief Thread local workspace */ diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index a1e96e92b4e0..06737f99a4de 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -43,6 +43,18 @@ class RPCDeviceAPI final : public DeviceAPI { GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); } + void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) final { + auto sess = GetSess(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); + void* data = + sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope); + RemoteSpace* space = new RemoteSpace(); + space->data = data; + space->sess = std::move(sess); + return space; + } + void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { auto sess = GetSess(ctx); @@ -65,30 +77,36 @@ class RPCDeviceAPI final : public DeviceAPI { } delete space; } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { + + void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + DLContext ctx_from = from->ctx; + DLContext ctx_to = to->ctx; if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) { ICHECK(ctx_from.device_type == ctx_to.device_type) << "Cannot copy across two different remote session"; - auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); - auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); - auto remote_ctx = remote_ctx_from; - if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to; - GetSess(ctx_from) - ->GetDeviceAPI(remote_ctx) - ->CopyDataFromTo(static_cast(from)->data, from_offset, - static_cast(to)->data, to_offset, size, - remote_ctx_from, remote_ctx_to, type_hint, stream); + DLTensor from_tensor = *from; + from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.data = static_cast(from->data)->data; + DLTensor to_tensor = *to; + to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.data = static_cast(to->data)->data; + auto remote_ctx = from_tensor.ctx; + if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx; + GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream); } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) { - auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); - GetSess(ctx_from)->CopyFromRemote(static_cast(from)->data, from_offset, - to, to_offset, size, remote_ctx_from, type_hint); + DLTensor from_tensor = *from; + from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.data = static_cast(from->data)->data; + void* to_bytes = static_cast(to->data) + to->byte_offset; + size_t nbytes = GetDataSize(*to); + GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes); } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) { - auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); - GetSess(ctx_to)->CopyToRemote(const_cast(from), from_offset, - static_cast(to)->data, to_offset, size, - remote_ctx_to, type_hint); + DLTensor to_tensor = *to; + to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.data = static_cast(to->data)->data; + void* from_bytes = static_cast(from->data) + from->byte_offset; + size_t nbytes = GetDataSize(*from); + GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes); } else { LOG(FATAL) << "expect copy from/to remote or between remote"; } @@ -99,6 +117,13 @@ class RPCDeviceAPI final : public DeviceAPI { GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream); } + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) final { + LOG(FATAL) << "Not implemented."; + } + private: std::shared_ptr GetSess(TVMContext ctx) { int tbl_index = GetRPCSessionIndex(ctx); diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index fbdd93fb4f62..8716355fd68f 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -387,88 +387,72 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { void HandleSyscall(RPCCode code); void HandleCopyFromRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - this->Read(&handle); - this->Read(&offset); - this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8; - + DLTensor* arr = RPCReference::ReceiveDLTensor(this); + uint64_t data_bytes; + this->Read(&data_bytes); + size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8; auto* sess = GetServingSession(); - // Return Copy Ack with the given data - auto fcopyack = [this](char* data_ptr, size_t num_bytes) { + auto fcopyack = [this](char* dptr, size_t num_bytes) { RPCCode code = RPCCode::kCopyAck; uint64_t packet_nbytes = sizeof(code) + num_bytes; this->Write(packet_nbytes); this->Write(code); - this->WriteArray(data_ptr, num_bytes); + this->WriteArray(dptr, num_bytes); this->SwitchToState(kRecvPacketNumBytes); }; // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { - char* data_ptr = reinterpret_cast(handle) + offset; - fcopyack(data_ptr, num_bytes); + if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { + char* data_ptr = reinterpret_cast(arr->data) + arr->byte_offset; + fcopyack(data_ptr, data_bytes); } else { - char* data_ptr = this->ArenaAlloc(num_bytes); - - auto on_copy_complete = [this, elem_bytes, num_bytes, data_ptr, fcopyack](RPCCode status, - TVMArgs args) { + char* temp_data = this->ArenaAlloc(data_bytes); + auto on_copy_complete = [this, elem_bytes, data_bytes, temp_data, fcopyack](RPCCode status, + TVMArgs args) { if (status == RPCCode::kException) { this->ReturnException(args.values[0].v_str); this->SwitchToState(kRecvPacketNumBytes); } else { // endian aware handling if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(data_ptr, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes); } - fcopyack(data_ptr, num_bytes); + fcopyack(temp_data, data_bytes); } }; this->SwitchToState(kWaitForAsyncCallback); - sess->AsyncCopyFromRemote(reinterpret_cast(handle), offset, data_ptr, 0, num_bytes, - ctx, type_hint, on_copy_complete); + sess->AsyncCopyFromRemote(arr, static_cast(temp_data), data_bytes, on_copy_complete); } } void HandleCopyToRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); - this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - - size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8; + DLTensor* arr = RPCReference::ReceiveDLTensor(this); + uint64_t data_bytes; + this->Read(&data_bytes); + size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8; auto* sess = GetServingSession(); // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (ctx.device_type == kDLCPU && sess->IsLocalSession()) { - char* dptr = reinterpret_cast(handle) + offset; - this->ReadArray(dptr, num_bytes); + if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) { + char* dptr = reinterpret_cast(arr->data) + arr->byte_offset; + this->ReadArray(dptr, data_bytes); if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(dptr, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(dptr, elem_bytes, data_bytes / elem_bytes); } this->ReturnVoid(); this->SwitchToState(kRecvPacketNumBytes); } else { - char* temp_data = this->ArenaAlloc(num_bytes); - this->ReadArray(temp_data, num_bytes); + char* temp_data = this->ArenaAlloc(data_bytes); + this->ReadArray(temp_data, data_bytes); if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(temp_data, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes); } auto on_copy_complete = [this](RPCCode status, TVMArgs args) { @@ -482,8 +466,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { }; this->SwitchToState(kWaitForAsyncCallback); - sess->AsyncCopyToRemote(temp_data, 0, reinterpret_cast(handle), offset, num_bytes, ctx, - type_hint, on_copy_complete); + sess->AsyncCopyToRemote(static_cast(temp_data), arr, data_bytes, on_copy_complete); } } @@ -815,51 +798,47 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v ICHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); } -void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t data_size, TVMContext ctx_to, DLDataType type_hint) { +void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) { std::lock_guard lock(mutex_); RPCCode code = RPCCode::kCopyToRemote; - uint64_t handle = reinterpret_cast(to); - uint64_t offset = static_cast(to_offset); - uint64_t size = static_cast(data_size); - uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) + - sizeof(ctx_to) + sizeof(type_hint) + data_size; + uint64_t num_data_bytes = static_cast(GetDataSize(*to)); + ICHECK_EQ(nbytes, num_data_bytes); + + uint64_t to_data = reinterpret_cast(to->data); + uint64_t shape_bytes = to->ndim * sizeof(int64_t); + uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) + + sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes + + sizeof(nbytes) + num_data_bytes; handler_->Write(packet_nbytes); handler_->Write(code); - handler_->Write(handle); - handler_->Write(offset); - handler_->Write(size); - handler_->Write(ctx_to); - handler_->Write(type_hint); - handler_->WriteArray(reinterpret_cast(from) + from_offset, data_size); - + RPCReference::SendDLTensor(handler_, to); + handler_->Write(nbytes); + handler_->WriteArray(reinterpret_cast(from_bytes), nbytes); ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn); } -void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t data_size, TVMContext ctx_from, DLDataType type_hint) { +void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) { std::lock_guard lock(mutex_); RPCCode code = RPCCode::kCopyFromRemote; - uint64_t handle = reinterpret_cast(from); - uint64_t offset = static_cast(from_offset); - uint64_t size = static_cast(data_size); - uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) + - sizeof(ctx_from) + sizeof(type_hint); + uint64_t num_data_bytes = static_cast(GetDataSize(*from)); + CHECK_EQ(nbytes, num_data_bytes); + + uint64_t from_data = reinterpret_cast(from->data); + uint64_t shape_bytes = from->ndim * sizeof(int64_t); + uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) + + sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) + + shape_bytes + sizeof(nbytes); handler_->Write(packet_nbytes); handler_->Write(code); - handler_->Write(handle); - handler_->Write(offset); - handler_->Write(size); - handler_->Write(ctx_from); - handler_->Write(type_hint); - - TVMRetValue rv; + RPCReference::SendDLTensor(handler_, from); + handler_->Write(nbytes); ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck); - handler_->ReadArray(reinterpret_cast(to) + to_offset, data_size); + + handler_->ReadArray(reinterpret_cast(to_bytes), nbytes); handler_->FinishCopyAck(); } @@ -904,6 +883,23 @@ void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { *rv = data; } +void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { + DLTensor* arr = args[0]; + TVMContext ctx = arr->ctx; + int ndim = arr->ndim; + int64_t* shape = arr->shape; + DLDataType dtype = arr->dtype; + int tcode = args[1].type_code(); + Optional mem_scope = NullOpt; + if (tcode == kTVMStr) { + mem_scope = args[1].operator String(); + } else { + ICHECK_EQ(tcode, kTVMNullptr); + } + void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope); + *rv = data; +} + void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { TVMContext ctx = args[0]; void* ptr = args[1]; @@ -911,25 +907,18 @@ void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { } void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - void* from = args[0]; - uint64_t from_offset = args[1]; - void* to = args[2]; - uint64_t to_offset = args[3]; - uint64_t size = args[4]; - TVMContext ctx_from = args[5]; - TVMContext ctx_to = args[6]; - DLDataType type_hint = args[7]; - TVMStreamHandle stream = args[8]; - TVMContext ctx = ctx_from; + DLTensor* from = args[0]; + DLTensor* to = args[1]; + TVMStreamHandle stream = args[2]; + TVMContext ctx = from->ctx; if (ctx.device_type == kDLCPU) { - ctx = ctx_to; + ctx = to->ctx; } else { - ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type) + ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type) << "Can not copy across different ctx types directly"; } - handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, - ctx_to, type_hint, stream); + handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream); } void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { @@ -951,6 +940,9 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { case RPCCode::kDevAllocData: SysCallHandler(RPCDevAllocData); break; + case RPCCode::kDevAllocDataWithScope: + SysCallHandler(RPCDevAllocDataWithScope); + break; case RPCCode::kDevFreeData: SysCallHandler(RPCDevFreeData); break; @@ -989,14 +981,12 @@ class RPCClientSession : public RPCSession, public DeviceAPI { endpoint_->CallFunc(func, arg_values, arg_type_codes, num_args, fencode_return); } - void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint) final { - endpoint_->CopyToRemote(from, from_offset, to, to_offset, nbytes, ctx_to, type_hint); + void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) final { + endpoint_->CopyToRemote(local_from_bytes, remote_to, nbytes); } - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint) final { - endpoint_->CopyFromRemote(from, from_offset, to, to_offset, nbytes, ctx_from, type_hint); + void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) final { + endpoint_->CopyFromRemote(remote_from, local_to_bytes, nbytes); } void FreeHandle(void* handle, int type_code) final { @@ -1019,15 +1009,30 @@ class RPCClientSession : public RPCSession, public DeviceAPI { return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint); } + void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) final { + DLTensor temp; + temp.data = nullptr; + temp.ctx = ctx; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + if (mem_scope.defined()) { + return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, + static_cast(mem_scope.value())); + } else { + return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, nullptr); + } + } + void FreeDataSpace(TVMContext ctx, void* ptr) final { endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr); } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { - endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, const_cast(from), from_offset, to, - to_offset, size, ctx_from, ctx_to, type_hint, stream); + void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h index 031435fc8ef9..8e08bfa75623 100644 --- a/src/runtime/rpc/rpc_endpoint.h +++ b/src/runtime/rpc/rpc_endpoint.h @@ -135,8 +135,7 @@ class RPCEndpoint { * \param ctx_to The target context. * \param type_hint Hint of content data type. */ - void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint); + void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes); /*! * \brief Copy bytes from remote array content. * \param from The source host data. @@ -147,8 +146,7 @@ class RPCEndpoint { * \param ctx_from The source context. * \param type_hint Hint of content data type. */ - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint); + void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes); /*! * \brief Call a remote defined system function with arguments. diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc index b35c62d255fc..0650b55d0d7c 100644 --- a/src/runtime/rpc/rpc_local_session.cc +++ b/src/runtime/rpc/rpc_local_session.cc @@ -87,26 +87,36 @@ void LocalSession::CallFunc(RPCSession::PackedFuncHandle func, const TVMValue* a this->EncodeReturn(std::move(rv), encode_return); } -void LocalSession::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t nbytes, TVMContext ctx_to, DLDataType type_hint) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - this->GetDeviceAPI(ctx_to)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, cpu_ctx, - ctx_to, type_hint, nullptr); +void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) { + ICHECK_EQ(nbytes, GetDataSize(*to)); + DLTensor from; + from.data = from_bytes; + from.ctx = {kDLCPU, 0}; + from.ndim = to->ndim; + from.shape = to->shape; + from.dtype = to->dtype; + from.strides = nullptr; + from.byte_offset = 0; + TVMContext ctx_to = to->ctx; + this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr); } -void LocalSession::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t nbytes, TVMContext ctx_from, DLDataType type_hint) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - - this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, ctx_from, - cpu_ctx, type_hint, nullptr); +void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) { + ICHECK_EQ(nbytes, GetDataSize(*from)); + DLTensor to; + to.data = to_bytes; + to.ctx = {kDLCPU, 0}; + to.ndim = from->ndim; + to.shape = from->shape; + to.dtype = from->dtype; + to.strides = nullptr; + to.byte_offset = 0; + + TVMContext ctx_from = from->ctx; + this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr); diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h index 7a67ce86bf80..ea070e34bd35 100644 --- a/src/runtime/rpc/rpc_local_session.h +++ b/src/runtime/rpc/rpc_local_session.h @@ -48,11 +48,9 @@ class LocalSession : public RPCSession { void CallFunc(PackedFuncHandle func, const TVMValue* arg_values, const int* arg_type_codes, int num_args, const FEncodeReturn& fencode_return) override; - void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint) override; + void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) override; - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint) override; + void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) override; void FreeHandle(void* handle, int type_code) override; diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index f5405f0c2fa0..0ac5b8dc74ef 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -51,33 +51,28 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values } } -void RPCSession::AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, RPCSession::FAsyncCallback callback) { +void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->CopyToRemote(local_from, local_from_offset, remote_to, remote_to_offset, nbytes, - remote_ctx_to, type_hint); + this->CopyToRemote(local_from_bytes, remote_to, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); } catch (const std::runtime_error& e) { this->SendException(callback, e.what()); } } -void RPCSession::AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, - TVMContext remote_ctx_from, DLDataType type_hint, +void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->CopyFromRemote(remote_from, remote_from_offset, local_to, local_to_offset, nbytes, - remote_ctx_from, type_hint); + this->CopyFromRemote(remote_from, local_to_bytes, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); } catch (const std::runtime_error& e) { this->SendException(callback, e.what()); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4ea937acc6ef..4b942f2230ba 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -127,30 +127,18 @@ class RPCSession { /*! * \brief Copy bytes into remote array content. - * \param local_from The source host data. - * \param local_from_offset The byte offeset in the from. + * \param local_from_bytes The source host data. * \param remote_to The target array. - * \param remote_to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_to The target context. - * \param type_hint Hint of content data type. */ - virtual void CopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint) = 0; + virtual void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) = 0; /*! * \brief Copy bytes from remote array content. * \param remote_from The source host data. - * \param remote_from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. + * \param local_to_bytes The target array. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_from The source context in the remote. - * \param type_hint Hint of content data type. */ - virtual void CopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from, - DLDataType type_hint) = 0; + virtual void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) = 0; /*! * \brief Free a remote function. @@ -223,40 +211,27 @@ class RPCSession { /*! * \brief Asynchrous version of CopyToRemote. * - * \param local_from The source host data. - * \param local_from_offset The byte offeset in the from. + * \param local_from_bytes The source host data. * \param remote_to The target array. - * \param remote_to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_to The target context. - * \param type_hint Hint of content data type. - * * \param on_complete The callback to signal copy complete. * \note All the allocated memory in local_from, and remote_to * must stay alive until on_compelete is called. */ - virtual void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, FAsyncCallback on_complete); + virtual void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + FAsyncCallback on_complete); /*! * \brief Asynchrous version of CopyFromRemote. * * \param remote_from The source host data. - * \param remote_from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. + * \param local_to_bytes The target array. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_from The source context in the remote. - * \param type_hint Hint of content data type. - * * \param on_complete The callback to signal copy complete. * \note All the allocated memory in remote_from, and local_to * must stay alive until on_compelete is called. */ - virtual void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, - TVMContext remote_ctx_from, DLDataType type_hint, + virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, FAsyncCallback on_complete); /*! * \brief Asynchrously wait for all events in ctx, stream compeletes. diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index cbf1974ee3c7..f40fd80f38b5 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -199,6 +199,7 @@ class VulkanDeviceAPI final : public DeviceAPI { delete pbuf; } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -307,6 +308,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } } + public: // Always use the default stream TVMStreamHandle CreateStream(TVMContext ctx) { LOG(FATAL) << "Not implemented"; diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 6abd12252d1d..b72caad1e3df 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -177,33 +177,37 @@ class AsyncLocalSession : public LocalSession { } } - void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, FAsyncCallback on_complete) final { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + FAsyncCallback on_complete) final { try { - this->GetDeviceAPI(remote_ctx_to) - ->CopyDataFromTo(local_from, local_from_offset, remote_to, remote_to_offset, nbytes, - cpu_ctx, remote_ctx_to, type_hint, nullptr); - this->AsyncStreamWait(remote_ctx_to, nullptr, on_complete); + DLTensor local_from; + local_from.data = local_from_bytes; + local_from.ctx = TVMContext{kDLCPU, 0}; + local_from.ndim = remote_to->ndim; + local_from.shape = remote_to->shape; + local_from.dtype = remote_to->dtype; + local_from.strides = nullptr; + local_from.byte_offset = 0; + this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr); + this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } } - void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from, - DLDataType type_hint, FAsyncCallback on_complete) final { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, + FAsyncCallback on_complete) final { try { - this->GetDeviceAPI(remote_ctx_from) - ->CopyDataFromTo(remote_from, remote_from_offset, local_to, local_to_offset, nbytes, - remote_ctx_from, cpu_ctx, type_hint, nullptr); - this->AsyncStreamWait(remote_ctx_from, nullptr, on_complete); + DLTensor local_to; + local_to.data = local_to_bytes; + local_to.ctx = TVMContext{kDLCPU, 0}; + local_to.ndim = remote_from->ndim; + local_to.shape = remote_from->shape; + local_to.dtype = remote_from->dtype; + local_to.strides = nullptr; + local_to.byte_offset = 0; + this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr); + this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 54601e37d037..62b87af01774 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -82,6 +82,7 @@ class WebGPUDeviceAPI : public DeviceAPI { void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -102,6 +103,7 @@ class WebGPUDeviceAPI : public DeviceAPI { } } + public: TVMStreamHandle CreateStream(TVMContext ctx) final { LOG(FATAL) << "Not implemented"; return nullptr;