From 0496e1d81d2a19f5594a7862bf8a0455d46404ab Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 7 Sep 2018 11:00:57 -0700 Subject: [PATCH 01/10] Rebase to upstream master combine modules for heterogeneous execution --- python/tvm/__init__.py | 2 +- python/tvm/build_module.py | 77 +++- python/tvm/contrib/graph_runtime.py | 71 ++- src/runtime/graph/graph_runtime.cc | 260 +++++++---- .../unittest/test_runtime_heterogeneous.py | 413 ++++++++++++++++++ 5 files changed, 724 insertions(+), 99 deletions(-) create mode 100644 tests/python/unittest/test_runtime_heterogeneous.py diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index a028dfeddf36..7076cf74b630 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -31,7 +31,7 @@ from .node import register_node from .ndarray import register_extension from .schedule import create_schedule -from .build_module import build, lower, build_config +from .build_module import build, lower, build_config, combine_modules from .tag import tag_scope # Contrib initializers diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 70935cde1816..88506b49d175 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -384,8 +384,14 @@ def build(sch, target=None, target_host=None, name="default_function", - binds=None): - """Build a function with arguments as signiture. + binds=None, + postpone_host_codegen=False): + """Build a function with arguments as signiture. Code will be generated + for a device specified by the target. For homogeneous execution, a module + that contains both host and device code is returned. For heterogeneous + execution, a list of lowered functions for the host and a module containing + device code are returned, but actual code generation for the host module is + postponed after code generation is finished for all devices. Parameters ---------- @@ -414,6 +420,11 @@ def build(sch, Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. + postpone_host_codegen : bool, optional + A bool value that indicates if code generation for the host module + should be postponed. This variable is set to be true for heterogeneous + execution. Otherwise, it is defaulted to false. + Returns ------- f : Function, or pair of functions @@ -498,9 +509,67 @@ def build(sch, fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice] fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost] fhost = [ir_pass.CombineContextCall(x) for x in fhost] - mhost = codegen.build_module(fhost, str(target_host)) + # Append fhost to the device module and return the updated module. All + # device modules will be imported to the host module after all of them are + # collected. + mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None + if postpone_host_codegen: + return mdev, fhost + + mhost = codegen.build_module(fhost, str(target_host)) if fdevice: - mdev = codegen.build_module(fdevice, str(target_device)) mhost.import_module(mdev) return mhost + +def combine_modules(host_funcs, device_modules, target_host=None): + """ Generate the host module for the lowered host functions by combining + them together. Then all device modules are imported to the combined host + module. This function is used for heterogeneous execution where multiple + device modules need to be imported to the host module. For homogeneous + execution, tvm.build is sufficient. + + Parameters + ---------- + host_funcs : LoweredFunc or list of LoweredFunc. + Lowered functions to be combined as the host module through codegen. + + device_modules : tvm.module or list of tvm.module. + Device modules will be imported into host module. + + Returns + ------- + mhost : The module that contains both host and device code. + """ + if isinstance(host_funcs, container.LoweredFunc): + host_funcs = [host_funcs] + elif not isinstance(host_funcs, (list, tuple, container.Array)): + raise ValueError("host_fucns must be the type of LoweredFunc or list " + "of LoweredFunc.") + + if isinstance(device_modules, module.Module): + device_modules = [device_modules] + elif not isinstance(device_modules, (list, tuple, container.Array)): + raise ValueError("host_funcs must be the type of Module or list of " + "Module.") + for func in host_funcs: + if not isinstance(func, container.LoweredFunc): + raise ValueError("host_fucns must be the type of LoweredFunc or " + "list of LoweredFunc.") + for device_mod in device_modules: + if device_mod and not isinstance(device_mod, module.Module): + raise ValueError("device_modules must be the type of Module or " + "list of Module.") + + if not target_host: + target_host = "llvm" if module.enabled("llvm") else "stackvm" + target_host = _target.create(target_host) + + # Generate code for the list of host functions. + mhost = codegen.build_module(host_funcs, str(target_host)) + # Import all device modules. + for device_mod in device_modules: + if device_mod: + mhost.import_module(device_mod) + + return mhost diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index e49b966e6a1e..2f8354068337 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -1,28 +1,28 @@ """Minimum graph runtime that executes graph containing TVM PackedFunc.""" import numpy as np +import tvm from .._ffi.base import string_types from .._ffi.function import get_global_func +from .._ffi.runtime_ctypes import TVMContext from ..rpc import base as rpc_base from .. import ndarray as nd - def create(graph_json_str, libmod, ctx): """Create a runtime executor module given a graph and module. - Parameters ---------- graph_json_str : str or graph class The graph to be deployed in json format output by nnvm graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. - libmod : tvm.Module The module of the corresponding function - - ctx : TVMContext - The context to deploy the module, can be local or remote. - + ctx : TVMContext or list of TVMContext + The context to deploy the module. It can be local or remote when there + is only one TVMContext. Otherwise, the first context in the list will + be used as this purpose. All context should be given for heterogeneous + execution. Returns ------- graph_module : GraphModule @@ -33,17 +33,54 @@ def create(graph_json_str, libmod, ctx): graph_json_str = graph_json_str._tvm_graph_json() except AttributeError: raise ValueError("Type %s is not supported" % type(graph_json_str)) - device_type = ctx.device_type - device_id = ctx.device_id - if device_type >= rpc_base.RPC_SESS_MASK: + if isinstance(ctx, TVMContext): + ctx = [ctx] + elif not isinstance(ctx, (list, tuple)): + raise ValueError("ctx has to be the type of TVMContext or a list of " + "TVMCTVMContext") + has_cpu = False + for i, cur_ctx in enumerate(ctx): + if not isinstance(cur_ctx, TVMContext): + raise ValueError("ctx has to be the type of TVMContext or a list " + "of TVMCTVMContext") + if cur_ctx.device_type == tvm.cpu(0).device_type: + has_cpu = True + elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: + ctx[0], ctx[i] = ctx[i], ctx[0] + + num_devices = len(ctx) + device_types = [] + device_ids = [] + for cur_ctx in ctx: + device_types.append(cur_ctx.device_type) + device_ids.append(cur_ctx.device_id) + + if device_types[0] >= rpc_base.RPC_SESS_MASK: + if num_devices > 1: + raise ValueError("RPC hasn't been supported for heterogeneous " + "execution yet.") assert libmod.type_key == "rpc" - assert rpc_base._SessTableIndex(libmod) == ctx._rpc_sess._tbl_index + assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index hmod = rpc_base._ModuleHandle(libmod) - fcreate = ctx._rpc_sess.get_function("tvm.graph_runtime.remote_create") - device_type = device_type % rpc_base.RPC_SESS_MASK - return GraphModule(fcreate(graph_json_str, hmod, device_type, device_id), ctx) + fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") + device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK + return GraphModule(fcreate(graph_json_str, hmod, device_types[0], + device_ids[0]), ctx[0]) + + # Assume CPU is the host processor when there are multiple devices on + # a hardware platform. + if (num_devices > 1) and (not has_cpu): + raise RuntimeError( + "CPU should be the host processor for heterogenous execution, but" + " not found in ctx.") + + device_type_arr = (ctypes.c_int * num_devices)(*device_types) + void_dt_arr = ctypes.cast(device_type_arr, ctypes.c_void_p) + device_id_arr = (ctypes.c_int * num_devices)(*device_ids) + void_di_arr = ctypes.cast(device_id_arr, ctypes.c_void_p) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, device_type, device_id), ctx) + return GraphModule(fcreate(graph_json_str, libmod, void_dt_arr, + void_di_arr, num_devices), ctx[0]) class GraphModule(object): @@ -69,6 +106,7 @@ class GraphModule(object): ctx : TVMContext The context this module is under """ + def __init__(self, module, ctx): self.module = module self._set_input = module["set_input"] @@ -177,7 +215,8 @@ def debug_get_output(self, node, out): if hasattr(self, '_debug_get_output'): self._debug_get_output(node, out) else: - raise RuntimeError("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0") + raise RuntimeError( + "Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0") return out def load_params(self, params_bytes): diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 162d616dea8a..54084358ef69 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -2,22 +2,29 @@ * Copyright (c) 2017 by Contributors * \file graph_runtime.cc */ +#include "graph_runtime.h" + +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include +#include + #include -#include #include -#include "graph_runtime.h" +#include +#include namespace tvm { namespace runtime { +using StorageDeviceMap = std::unordered_map; +using DeviceStoragePoolMap = std::unordered_map; +using ModuleContextMap = std::unordered_map; -/*! \brief macro to do C API call */ +/*! \brief Macro to do C API call. */ #define TVM_CCALL(func) \ { \ int ret = (func); \ @@ -34,7 +41,7 @@ namespace runtime { class GraphRuntime : public ModuleNode { public: /*! - * \brief Get member function to front-end + * \brief Get member function to front-end. * \param name The name of the function. * \param sptr_to_self The pointer to the module node. * \return The corresponding member function. @@ -58,12 +65,13 @@ class GraphRuntime : public ModuleNode { /*! * \brief Initialize the graph executor with graph and context. * \param graph_json The execution graph. - * \param module The module containing the compiled functions. - * \param ctx The context where the graph should sit on + * \param module The module containing the compiled functions for the host + * processor. + * \param ctxs The context of the host and devices where graph nodes will be + * executed on. */ - void Init(const std::string& graph_json, - tvm::runtime::Module module, - TVMContext ctx) { + void Init(const std::string& graph_json, const tvm::runtime::Module& module, + const std::vector& ctxs) { #ifndef _LIBCPP_SGX_NO_IOSTREAMS std::istringstream is(graph_json); #else @@ -72,10 +80,11 @@ class GraphRuntime : public ModuleNode { dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; - ctx_ = ctx; + ctxs_ = ctxs; this->SetupStorage(); this->SetupOpExecs(); } + /*! * \brief Get the input index given the name of input. * \param name The name of the input. @@ -92,7 +101,7 @@ class GraphRuntime : public ModuleNode { return -1; } /*! - * \brief set index-th input to the graph. + * \brief Set index-th input to the graph. * \param index The input index. * \param data_in The input data. */ @@ -134,7 +143,7 @@ class GraphRuntime : public ModuleNode { /*! * \brief Copy index-th output to data_out. * \param index The output index. - * \param data_out the output data. + * \param data_out The output data. */ void CopyOutputTo(int index, DLTensor* data_out) { CHECK_LT(static_cast(index), outputs_.size()); @@ -172,8 +181,8 @@ class GraphRuntime : public ModuleNode { * from begining upto the index-th node and return output of index-th node. * This is costly operation and suggest to use only for debug porpose. * - * \param index: The index of the node. - * \param data_out the node data. + * \param index The index of the node. + * \param data_out The node data. */ void DebugGetNodeOutput(int index, DLTensor* data_out) { CHECK_LT(static_cast(index), nodes_.size()); @@ -188,7 +197,7 @@ class GraphRuntime : public ModuleNode { } #endif /*! - * \brief Load parameters from binary stream + * \brief Load parameters from binary stream. * \param strm The input stream. */ void LoadParams(dmlc::Stream* strm); @@ -232,6 +241,8 @@ class GraphRuntime : public ModuleNode { TVMOpParam param; // inputs std::vector inputs; + // device_type is used to indicate where the node should be scheduled to. + DLDeviceType device_type; // control deps std::vector control_deps; // JSON Loader @@ -252,16 +263,20 @@ class GraphRuntime : public ModuleNode { bitmask |= 4; } else if (key == "flatten_data") { param->flatten_data = strtoul(value.c_str(), nullptr, 10); - bitmask |= 8; + // TODO(zhiics) Enable the following when annotation is added to the + // heterogeneous compilation part. + // bitmask |= 8; } } - CHECK_EQ(bitmask, 1|2|4|8) << "invalid format"; + // TODO(zhiics) Add |8 when annotation is added to the heterogeneous + // compilation part. + CHECK_EQ(bitmask, 1|2|4) << "invalid format"; } // JSON Loader void Load(dmlc::JSONReader *reader) { reader->BeginObject(); - std::unordered_map dict; int bitmask = 0; + int device_type = 0; std::string key; while (reader->NextObjectItem(&key)) { if (key == "op") { @@ -277,10 +292,16 @@ class GraphRuntime : public ModuleNode { this->LoadAttrs(reader, ¶m); } else if (key == "control_deps") { reader->Read(&control_deps); + } else if (key == "device_type") { + reader->Read(&device_type); + this->device_type = static_cast(device_type); + // TODO(zhiics) Enable this when working on the compiler part. + // bitmask |= 8; } else { LOG(FATAL) << "do not support key " << key; } } + // TODO(zhiics) Add |8 in the compiler pass. CHECK_EQ(bitmask, 1|2|4) << "invalid format"; } }; @@ -372,13 +393,16 @@ class GraphRuntime : public ModuleNode { } /*! \brief Setup the temporal storage */ void SetupStorage(); - /*! \brief Setup the executors */ + /*! \brief Setup the executors. */ void SetupOpExecs(); + /*! \brief Get storage id to device map. */ + StorageDeviceMap GetStorageDeviceMap() const; /*! * \brief Create a executtion function given input. - * \param attrs The node attributes + * \param attrs The node attributes. * \param args The arguments to the functor, including inputs and outputs. - * \param num_inputs Number of inputs + * \param num_inputs Number of inputs. + * \param dev_type The device type of the tvm_op. * \return The created executor. */ std::function CreateTVMOp(const TVMOpParam& attrs, @@ -392,7 +416,7 @@ class GraphRuntime : public ModuleNode { uint32_t entry_id(const NodeEntry& e) const { return entry_id(e.node_id, e.index); } - // Number of node entries + // Number of node entries. uint32_t num_node_entries() const { return node_row_ptr_.back(); } @@ -400,25 +424,25 @@ class GraphRuntime : public ModuleNode { uint32_t num_nodes() const { return static_cast(nodes_.size()); } - // The graph nodes. + /*! \brief The graph nodes. */ std::vector nodes_; - // The argument nodes. + /*! \brief The argument nodes. */ std::vector input_nodes_; - // used or quick entry indexing + /*! \brief Used for quick entry indexing. */ std::vector node_row_ptr_; - // output entries + /*! \brief Output entries. */ std::vector outputs_; - // Additional graph attributes + /*! \brief Additional graph attributes. */ GraphAttr attrs_; - /*! \brief The code module */ + /*! \brief The code module that contains both host and device code. */ tvm::runtime::Module module_; - /*! \brief execution context */ - TVMContext ctx_; - /*! \brief common storage pool */ - std::vector storage_pool_; - /*! \brief data entry of each node */ + /*! \brief Execution context of all devices including the host. */ + std::vector ctxs_; + /*! \brief Common storage pool for each device. */ + DeviceStoragePoolMap device_storage_pool_; + /*! \brief Data entry of each node. */ std::vector data_entry_; - /*! \brief operator on each node */ + /*! \brief Operator on each node. */ std::vector > op_execs_; }; @@ -452,16 +476,45 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { } } +// Return a storage id to device type map. This map will be used to help memory +// allocation for the storage pool of each device. It will be also used to +// allocate memory to each data_entry_. +StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const { + StorageDeviceMap sid_dev_map; + for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) { + const auto& inode = nodes_[nid]; + for (const auto& e : inode.inputs) { + uint32_t eid = this->entry_id(e); + uint32_t sid = attrs_.storage_id[eid]; + auto en_dev = nodes_[e.node_id].device_type; + CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev) + << "Cannot map the same storage id to multiple devices."; + sid_dev_map[sid] = en_dev; + } + } + // Get all output entries. + for (const auto& output : outputs_) { + uint32_t eid = this->entry_id(output); + uint32_t sid = attrs_.storage_id[eid]; + auto en_dev = nodes_[eid].device_type; + CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev) + << "Cannot map the same storage id to multiple devices."; + sid_dev_map[sid] = en_dev; + } + return sid_dev_map; +} + void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; for (const std::string& s_type : attrs_.dltype) { vtype.push_back(tvm::runtime::String2TVMType(s_type)); } - data_entry_.resize(num_node_entries()); - // size of each storage pool entry - std::vector pool_entry_bytes; - // Find the maximum space size. + + const StorageDeviceMap& sid_dev_map = GetStorageDeviceMap(); + std::unordered_map device_pool_entry_bytes; + + // Find the maximum space size for each device. for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; size_t size = 1; @@ -474,23 +527,38 @@ void GraphRuntime::SetupStorage() { CHECK_EQ(bits % 8U, 0U); size_t bytes = (bits / 8U) * size; - size_t sid = static_cast(storage_id); - if (sid >= pool_entry_bytes.size()) { - pool_entry_bytes.resize(sid + 1, 0); - } - pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes); + uint32_t sid = static_cast(storage_id); + device_pool_entry_bytes[sid] = + std::max(device_pool_entry_bytes[sid], bytes); } - // Allocate the space. - for (size_t i = 0; i < pool_entry_bytes.size(); ++i) { + + // Allocate the space on each device. + for (const auto& pit : device_pool_entry_bytes) { std::vector shape; - shape.push_back(static_cast(pool_entry_bytes[i] + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType {kDLFloat, 32, 1}, ctx_)); + shape.push_back(static_cast(pit.second + 3) / 4); + TVMContext ctx = ctxs_[0]; + // This for loop is very fast since there are usually only a couple of + // devices available on the same hardware. + for (const auto& cit : ctxs_) { + if (sid_dev_map.at(pit.first) == cit.device_type) { + ctx = cit; + break; + } + } + device_storage_pool_[pit.first] = + NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); } - // Assign the pooled entries. + + // Assign the pooled entries. A unified memory pool is used to simplifiy + // memory assignment for each node entry. The allocated memory on each device + // is mapped to this pool by querying the storage id to device type map. + data_entry_.resize(num_node_entries()); for (size_t i = 0; i < data_entry_.size(); ++i) { - int storage_id = attrs_.storage_id[i]; - CHECK_LT(static_cast(storage_id), storage_pool_.size()); - data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + uint32_t storage_id = static_cast(attrs_.storage_id[i]); + CHECK(device_storage_pool_.count(storage_id)) + << "The storage hasn't been assigned to a specific device."; + data_entry_[i] = + device_storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); } } @@ -508,8 +576,9 @@ void GraphRuntime::SetupOpExecs() { uint32_t eid = this->entry_id(nid, index); args.push_back(*(data_entry_[eid].operator->())); } - CHECK_EQ(inode.op_type, "tvm_op") - << "Can only take tvm_op as op"; + CHECK(inode.op_type == "tvm_op" || inode.op_type == "device_copy_op") + << "Can only take tvm_op or device_copy_op as op"; + op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size()); } } @@ -543,13 +612,26 @@ std::function GraphRuntime::CreateTVMOp( t->shape = &(arg_ptr->shape_data[i]); } } + if (param.func_name == "__nop") { return [](){}; + } else if (param.func_name == "__copy") { + // Perform cross device data copy. + // Directly copy data from the input to the output. + auto fexec = [arg_ptr]() { + DLTensor* from = static_cast(arg_ptr->arg_values[0].v_handle); + DLTensor* to = static_cast(arg_ptr->arg_values[1].v_handle); + TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr)); + }; + return fexec; } - // get compiled function from module. + + // Get compiled function from the module that contains both host and device + // code. tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false); CHECK(pf != nullptr) << "no such function in module: " << param.func_name; - auto fexec = [arg_ptr, pf] () { + + auto fexec = [arg_ptr, pf]() { TVMRetValue rv; TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), @@ -562,7 +644,7 @@ std::function GraphRuntime::CreateTVMOp( PackedFunc GraphRuntime::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - // return member functions during query. + // Return member functions during query. if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { if (args[0].type_code() == kStr) { @@ -618,29 +700,51 @@ PackedFunc GraphRuntime::GetFunction( } } -Module GraphRuntimeCreate(std::string sym_json, - tvm::runtime::Module m, - int device_type, - int device_id) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; +Module GraphRuntimeCreate(const std::string& sym_json, + const tvm::runtime::Module& m, + const std::vector& ctxs) { std::shared_ptr exec = std::make_shared(); - exec->Init(sym_json, m, ctx); + exec->Init(sym_json, m, ctxs); return Module(exec); } +// Get all context for the host and other runtime devices. +std::vector GetAllContext(const TVMArgs& args) { + std::vector ret; + int* device_types = args[2].ptr(); + int* device_ids = args[3].ptr(); + int num_devices = args[4]; + + TVMContext ctx; + for (int i = 0; i < num_devices; i++) { + ctx.device_type = static_cast(device_types[i]); + ctx.device_id = device_ids[i]; + ret.push_back(ctx); + } + + return ret; +} + TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = GraphRuntimeCreate(args[0], args[1], args[2], args[3]); - }); + .set_body([](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.size(), 5) << "5 arguments are expected, but " + << args.size() << " are passed in."; + tvm::runtime::Module modules = args[1]; + const auto& contexts = GetAllContext(args); + *rv = GraphRuntimeCreate(args[0], modules, contexts); + }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") -.set_body([](TVMArgs args, TVMRetValue *rv) { - void* mhandle = args[1]; - *rv = GraphRuntimeCreate(args[0], - *static_cast(mhandle), - args[2], args[3]); - }); + .set_body([](TVMArgs args, TVMRetValue* rv) { + void* mhandle = args[1]; + TVMContext ctx; + int dev_type = args[2]; + ctx.device_type = static_cast(dev_type); + ctx.device_id = args[3]; + std::vector contexts{ctx}; + *rv = GraphRuntimeCreate(args[0], + *static_cast(mhandle), + contexts); + }); } // namespace runtime } // namespace tvm diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py new file mode 100644 index 000000000000..4805bd44ec73 --- /dev/null +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -0,0 +1,413 @@ +# pylint: disable=too-many-locals +"""Unit tests for heterogeneous runtime""" +import json +import numpy as np + +import tvm +from tvm.contrib import graph_runtime, util +import topi + +def get_simplex_graph(host_dev_type, device_dev_type): + r""" Return the hand-crafted json object where only one copy node is + inserted. Tis node copies data from the target device to cpu. + The network is constructed as following: + A B + \ / + elemwise_add (gpu) + \ + copy C + \ / + elemwise_sub (cpu) + + Parameters + ---------- + host_dev_type : int + The device type of the host processor, e.g. cpu. + device_dev_type : int + The device type of the device processor, e.g. gpu, opencl, etc. + + Returns + ------- + json : json + A json encoded object. + """ + # Construct each node in the graph. + var_a = {"op": "null", "name": "A", "device_type": device_dev_type, + "inputs": []} + var_b = {"op": "null", "name": "B", "device_type": device_dev_type, + "inputs": []} + elemwise_add = { + "op": "tvm_op", "name": "elemwise_add", "device_type": device_dev_type, + "attrs": { + "flatten_data": "1", + "func_name": "elemwise_add", + "num_inputs": "2", + "num_outputs": "1" + }, + "inputs": [[0, 0, 0], [1, 0, 0]] + } + copy = { + "op": "device_copy_op", + "name": "__copy_add_to_sub", + "device_type": host_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "__copy", + "num_inputs": "1", + "num_outputs": "1" + }, + "inputs": [[2, 0, 0]] + } + var_c = {"op": "null", "name": "C", "device_type": host_dev_type, + "inputs": []} + elemwise_sub = { + "op": "tvm_op", "name": "elemwise_sub", + "device_type": host_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "elemwise_sub", + "num_inputs": "2", + "num_outputs": "1" + }, + "inputs": [[3, 0, 0], [4, 0, 0]] + } + + # Group the nodes. + nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub] + arg_nodes = [0, 1, 4] + node_row_ptr = [0, 1, 2, 3, 4, 5, 6] + heads = [[5, 0, 0]] + shape = (4,) + attrs = { + "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]], + "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]], + "device_type": ["list_int", [device_dev_type, device_dev_type, + device_dev_type, host_dev_type, + host_dev_type, host_dev_type]], + "dtype": ["list_int", [0, 0, 0, 0, 0, 0]], + "dltype": ["list_str", ["float32", "float32", "float32", + "float32", "float32", "float32"]] + } + + # Construct the graph. + graph = {"nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": heads, + "attrs": attrs} + return json.dumps(graph) + + +def test_simplex_data_transferring(): + r""" + Test the heterogeneous execution of a simple network where data + transferring is from the target device to the host processor at runtime. + The host processor is always assumed to be cpu, and the device varies. + """ + host = "cpu" + target_host = "llvm" + host_ctx = tvm.context(host) + if not tvm.module.enabled(target_host): + print("Skip test because llvm is not enabled.") + return + + def check_device(device, target_device): + if not tvm.module.enabled(target_device): + print("Skip test because {} is not enabled.".format(target_device)) + return + + device_ctx = tvm.context(device) + graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type) + shape = (4,) + + # Create module for add whose target is the device. + tensor_a = tvm.placeholder(shape, name="A") + tensor_b = tvm.placeholder(shape, name="B") + elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i) + + tensor_b(*i), name="elemwise_add") + target = topi.cpp.TEST_create_target(device) + schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) + lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], + name="elemwise_add") + lib_add, host_funcs_add = tvm.build(lower_add, target=target_device, + name="elemwise_add", + postpone_host_codegen=True) + + # Insert copy. Neither compute nor schedule is required for the copy + # node. The compute will be performed at runtime which is just data + # copy from the input to the output. + tensor_copy = tvm.placeholder(shape, name="__copy") + + # Create module for sub whose target is the host. + tensor_c = tvm.placeholder(shape, name="C") + elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i) + - tensor_c(*i), name="elemwise_sub") + schedule_sub = tvm.create_schedule(elemwise_sub.op) + lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c, + elemwise_sub], + name="elemwise_sub") + + lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host, + name="elemwise_sub", + postpone_host_codegen=True) + host_funcs = host_funcs_add + host_funcs_sub + combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub], + target_host=target_host) + + ctx = [host_ctx, device_ctx] + mod = graph_runtime.create(graph, combined_mod, ctx) + params = {} + params["A"] = tensor_a = np.random.uniform( + size=shape).astype(tensor_a.dtype) + params["B"] = tensor_b = np.random.uniform( + size=shape).astype(tensor_b.dtype) + params["C"] = tensor_c = np.random.uniform( + size=shape).astype(tensor_c.dtype) + mod.set_input(**params) + mod.run() + out = mod.get_output(0, tvm.nd.empty(shape)) + np.testing.assert_equal( + out.asnumpy(), (tensor_a + tensor_b) - tensor_c) + + dev_tar = {"gpu": "cuda", "opencl": "opencl"} + for device, target in dev_tar.items(): + check_device(device, target) + + +def get_duplex_graph(host_dev_type, device_dev_type): + r""" Return the hand-crafted json object where two copy nodes are inserted. + Data transferring happens back-and-forth between the target device and CPU. + The network is constructed as following: + A B + \ / + elemwise_add (gpu) + \ + copy C + \ / + elemwise_sub (cpu) + \ + copy D + \ / + elemwise_add (gpu) + + Parameters + ---------- + host_dev_type : int + The device type of the host processor, e.g. cpu. + device_dev_type : int + The device type of the device processor, e.g. gpu, opencl, etc. + + Returns + ------- + json : json + A json encoded object. + """ + # Construct each node in the graph. + var_a = {"op": "null", "name": "A", "device_type": device_dev_type, + "inputs": []} + var_b = {"op": "null", "name": "B", "device_type": device_dev_type, + "inputs": []} + elemwise_add0 = { + "op": "tvm_op", "name": "elemwise_add0", "device_type": + device_dev_type, + "attrs": { + "flatten_data": "1", + "func_name": "elemwise_add0", + "num_inputs": "2", + "num_outputs": "1" + }, + "inputs": [[0, 0, 0], [1, 0, 0]] + } + copy_add_sub = { + "op": "device_copy_op", + "name": "__copy_add_to_sub", + "device_type": host_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "__copy", + "num_inputs": "1", + "num_outputs": "1" + }, + "inputs": [[2, 0, 0]] + } + var_c = {"op": "null", "name": "C", "device_type": host_dev_type, + "inputs": []} + elemwise_sub = { + "op": "tvm_op", "name": "elemwise_sub", + "device_type": host_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "elemwise_sub", + "num_inputs": "2", + "num_outputs": "1" + }, + "inputs": [[3, 0, 0], [4, 0, 0]] + } + copy_sub_add = { + "op": "device_copy_op", + "name": "__copy_sub_to_add", + "device_type": device_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "__copy", + "num_inputs": "1", + "num_outputs": "1" + }, + "inputs": [[5, 0, 0]] + } + var_d = {"op": "null", "name": "D", "device_type": device_dev_type, + "inputs": []} + elemwise_add1 = { + "op": "tvm_op", "name": "elemwise_add1", + "device_type": device_dev_type, + "attrs": { + "flatten_data": "0", + "func_name": "elemwise_add1", + "num_inputs": "2", + "num_outputs": "1" + }, + "inputs": [[6, 0, 0], [7, 0, 0]] + } + + # Group the nodes. + nodes = [var_a, var_b, elemwise_add0, copy_add_sub, var_c, elemwise_sub, + copy_sub_add, var_d, elemwise_add1] + arg_nodes = [0, 1, 4, 7] + node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + heads = [[8, 0, 0]] + shape = (4,) + attrs = { + "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]], + "shape": ["list_shape", [shape, shape, shape, shape, shape, shape, + shape, shape, shape]], + "device_type": ["list_int", [device_dev_type, device_dev_type, + device_dev_type, + host_dev_type, host_dev_type, host_dev_type, + device_dev_type, device_dev_type, + device_dev_type]], + "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]], + "dltype": ["list_str", ["float32", "float32", "float32", + "float32", "float32", "float32", + "float32", "float32", "float32"]] + } + + # Construct the graph. + graph = {"nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": heads, + "attrs": attrs} + return json.dumps(graph) + + +def test_duplex_data_transferring(): + r""" + Test the heterogeneous execution of a simple network where data + transferring occurs back-and-forth between the target device and host + processor. + The host processor is always assumed to be cpu, and the target device + varies. + """ + host = "cpu" + target_host = "llvm" + host_ctx = tvm.context(host) + if not tvm.module.enabled(target_host): + print("Skip test because llvm is not enabled.") + return + + def check_device(device, target_device): + if not tvm.module.enabled(target_device): + print("Skip test because {} is not enabled.".format(target_device)) + return + + device_ctx = tvm.context(device) + graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type) + shape = (4,) + + # Insert copy nodes for data transferring between add and sub nodes. + # Transfers data from gpu to cpu. + copy_add_sub = tvm.placeholder(shape, name="__copy0") + # Transfers data from cpu to gpu. + copy_sub_add = tvm.placeholder(shape, name="__copy1") + + # Create a module containing adds on the device. + tensor_a = tvm.placeholder(shape, name="A") + tensor_b = tvm.placeholder(shape, name="B") + tensor_d = tvm.placeholder(shape, name="D") + elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i) + + tensor_b(*i), name="elemwise_add0") + elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i) + + tensor_d(*i), name="elemwise_add1") + target = topi.cpp.TEST_create_target(device) + add_schedule0 = topi.cpp.cuda.schedule_injective( + target, [elemwise_add0]) + lower_add0 = tvm.lower( + add_schedule0, [tensor_a, tensor_b, elemwise_add0], + name="elemwise_add0") + add_schedule1 = topi.cpp.cuda.schedule_injective( + target, [elemwise_add1]) + lower_add1 = tvm.lower( + add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], + name="elemwise_add1") + lib_add, host_funcs_add = tvm.build([lower_add0, lower_add1], + target=target_device, + postpone_host_codegen=True) + + # Create module for sub whose target is the host. + tensor_c = tvm.placeholder(shape, name="C") + elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i) + - tensor_c(*i), name="elemwise_sub") + sub_schedule = tvm.create_schedule(elemwise_sub.op) + lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, + elemwise_sub], + name="elemwise_sub") + lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host, + postpone_host_codegen=True) + host_funcs = host_funcs_add + host_funcs_sub + + combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub], + target_host=target_host) + ctx = [host_ctx, device_ctx] + params = {} + params["A"] = tensor_a = np.random.uniform( + size=shape).astype(tensor_a.dtype) + params["B"] = tensor_b = np.random.uniform( + size=shape).astype(tensor_b.dtype) + params["C"] = tensor_c = np.random.uniform( + size=shape).astype(tensor_c.dtype) + params["D"] = tensor_d = np.random.uniform( + size=shape).astype(tensor_d.dtype) + + def check_verify(): + mod = graph_runtime.create(graph, combined_mod, ctx) + mod.set_input(**params) + mod.run() + out = mod.get_output(0, tvm.nd.empty(shape)) + np.testing.assert_equal( + out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) + + def check_load_module(): + temp = util.tempdir() + path_lib = temp.relpath("deploy.so") + combined_mod.export_library(path_lib) + with open(temp.relpath("deploy.json"), "w") as out_file: + out_file.write(graph) + loaded_lib = tvm.module.load(path_lib) + loaded_graph = open(temp.relpath("deploy.json")).read() + mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) + mod.set_input(**params) + mod.run() + out = mod.get_output(0, tvm.nd.empty(shape)) + np.testing.assert_equal( + out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) + + check_verify() + check_load_module() + + dev_tar = {"gpu": "cuda", "opencl": "opencl"} + for device, target in dev_tar.items(): + check_device(device, target) + +if __name__ == "__main__": + test_simplex_data_transferring() + test_duplex_data_transferring() From 0d0b4193c66523bf835e408a40cc1a1403255a79 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 18 Sep 2018 11:41:56 -0700 Subject: [PATCH 02/10] fix lint --- src/runtime/graph/graph_runtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 54084358ef69..a4eedeb5b1b4 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -739,7 +739,7 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") void* mhandle = args[1]; TVMContext ctx; int dev_type = args[2]; - ctx.device_type = static_cast(dev_type); + ctx.device_type = static_cast(dev_type); ctx.device_id = args[3]; std::vector contexts{ctx}; *rv = GraphRuntimeCreate(args[0], From f0ef89d4b5e41f975dabf02d7e732518df586470 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 18 Sep 2018 19:51:11 -0700 Subject: [PATCH 03/10] reserve the functionality of calling graph_time.create from java and js --- src/runtime/graph/graph_runtime.cc | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index a4eedeb5b1b4..449604a3c01e 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -727,11 +727,28 @@ std::vector GetAllContext(const TVMArgs& args) { TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size(), 5) << "5 arguments are expected, but " - << args.size() << " are passed in."; - tvm::runtime::Module modules = args[1]; - const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], modules, contexts); + std::vector contexts; + // 4 argument version is currently reserved to keep support of calling + // from jvm4j and js, since they don't have heterogeneous execution + // support yet. For heterogenenous execution, 5 arguments will be passed + // in. They are graph_json, module, list of context types, list of context + // ids, and the number of devices. Eventually, we will only have the + // version with 5 parameters when we support heterogeneous execution for + // Java and js. + if (args.num_args == 4) { + TVMContext ctx; + int dev_type = args[2]; + ctx.device_type = static_cast(dev_type); + ctx.device_id = args[3]; + contexts.push_back(ctx); + } else if (args.num_args == 5) { + contexts = GetAllContext(args); + } else { + LOG(FATAL) + << "The number arguments of creaet must be 4 or 5, but it has " + << args.num_args; + } + *rv = GraphRuntimeCreate(args[0], args[1], contexts); }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") From 6f02aa13092762db55e416ffb7f941247ceed251 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 18 Sep 2018 22:33:31 -0700 Subject: [PATCH 04/10] default device_type to CPU for transition purpose --- src/runtime/graph/graph_runtime.cc | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 449604a3c01e..ddad5bf2103b 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -242,7 +242,12 @@ class GraphRuntime : public ModuleNode { // inputs std::vector inputs; // device_type is used to indicate where the node should be scheduled to. - DLDeviceType device_type; + // TODO(zhiics) device_type is defaulted to CPU for transition purpose only + // because it will have random value otherwise. Using this default value is + // to make sure homogeneous execution works correctly. It will be removed + // when we add the compiler pass as we will read the serialized value from + // json for all execution. + DLDeviceType device_type{kDLCPU}; // control deps std::vector control_deps; // JSON Loader @@ -496,7 +501,7 @@ StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const { for (const auto& output : outputs_) { uint32_t eid = this->entry_id(output); uint32_t sid = attrs_.storage_id[eid]; - auto en_dev = nodes_[eid].device_type; + auto en_dev = nodes_[output.node_id].device_type; CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev) << "Cannot map the same storage id to multiple devices."; sid_dev_map[sid] = en_dev; @@ -728,13 +733,13 @@ std::vector GetAllContext(const TVMArgs& args) { TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") .set_body([](TVMArgs args, TVMRetValue* rv) { std::vector contexts; - // 4 argument version is currently reserved to keep support of calling - // from jvm4j and js, since they don't have heterogeneous execution - // support yet. For heterogenenous execution, 5 arguments will be passed - // in. They are graph_json, module, list of context types, list of context - // ids, and the number of devices. Eventually, we will only have the - // version with 5 parameters when we support heterogeneous execution for - // Java and js. + // 4-argument version is currently reserved to keep support of calling + // from tvm4j and javascript, since they don't have heterogeneous + // execution support yet. For heterogenenous execution, 5 arguments will + // be passed in. They are graph_json, module, list of context types, list + // of context ids, and the number of devices. + // Eventually, we will only have the version with 5 arguments when we + // support heterogeneous execution for Java and js. if (args.num_args == 4) { TVMContext ctx; int dev_type = args[2]; @@ -745,7 +750,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") contexts = GetAllContext(args); } else { LOG(FATAL) - << "The number arguments of creaet must be 4 or 5, but it has " + << "The expected number of arguments for graph_runtime.create is " + "4 or 5, but it has " << args.num_args; } *rv = GraphRuntimeCreate(args[0], args[1], contexts); From ec979b761d1e5113a336203aff75dcd0fff6da57 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Wed, 19 Sep 2018 09:34:56 -0700 Subject: [PATCH 05/10] fix description for tvm.build --- python/tvm/build_module.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 88506b49d175..ad08de9e5e5d 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -427,8 +427,11 @@ def build(sch, Returns ------- - f : Function, or pair of functions - The result function. + ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple + A module that combines both host and device code is returned when + postpone_host_codegen is not set. Otherwise, a list of lowered + functions for the host and a module contains only device code are + returned. Note ---- From a619cf4b49f2761c4271a8847c13050a1695f748 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 20 Sep 2018 12:48:12 -0700 Subject: [PATCH 06/10] remove map structures --- python/tvm/__init__.py | 2 +- python/tvm/build_module.py | 54 +---- python/tvm/contrib/graph_runtime.py | 22 +- src/runtime/graph/graph_runtime.cc | 195 ++++++++---------- .../unittest/test_runtime_heterogeneous.py | 76 +++---- 5 files changed, 126 insertions(+), 223 deletions(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 7076cf74b630..a028dfeddf36 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -31,7 +31,7 @@ from .node import register_node from .ndarray import register_extension from .schedule import create_schedule -from .build_module import build, lower, build_config, combine_modules +from .build_module import build, lower, build_config from .tag import tag_scope # Contrib initializers diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index ad08de9e5e5d..9f63d0a6a50d 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -518,61 +518,9 @@ def build(sch, # collected. mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None if postpone_host_codegen: - return mdev, fhost + return fhost, mdev mhost = codegen.build_module(fhost, str(target_host)) if fdevice: mhost.import_module(mdev) return mhost - -def combine_modules(host_funcs, device_modules, target_host=None): - """ Generate the host module for the lowered host functions by combining - them together. Then all device modules are imported to the combined host - module. This function is used for heterogeneous execution where multiple - device modules need to be imported to the host module. For homogeneous - execution, tvm.build is sufficient. - - Parameters - ---------- - host_funcs : LoweredFunc or list of LoweredFunc. - Lowered functions to be combined as the host module through codegen. - - device_modules : tvm.module or list of tvm.module. - Device modules will be imported into host module. - - Returns - ------- - mhost : The module that contains both host and device code. - """ - if isinstance(host_funcs, container.LoweredFunc): - host_funcs = [host_funcs] - elif not isinstance(host_funcs, (list, tuple, container.Array)): - raise ValueError("host_fucns must be the type of LoweredFunc or list " - "of LoweredFunc.") - - if isinstance(device_modules, module.Module): - device_modules = [device_modules] - elif not isinstance(device_modules, (list, tuple, container.Array)): - raise ValueError("host_funcs must be the type of Module or list of " - "Module.") - for func in host_funcs: - if not isinstance(func, container.LoweredFunc): - raise ValueError("host_fucns must be the type of LoweredFunc or " - "list of LoweredFunc.") - for device_mod in device_modules: - if device_mod and not isinstance(device_mod, module.Module): - raise ValueError("device_modules must be the type of Module or " - "list of Module.") - - if not target_host: - target_host = "llvm" if module.enabled("llvm") else "stackvm" - target_host = _target.create(target_host) - - # Generate code for the list of host functions. - mhost = codegen.build_module(host_funcs, str(target_host)) - # Import all device modules. - for device_mod in device_modules: - if device_mod: - mhost.import_module(device_mod) - - return mhost diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 2f8354068337..a857225008f3 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -38,13 +38,13 @@ def create(graph_json_str, libmod, ctx): elif not isinstance(ctx, (list, tuple)): raise ValueError("ctx has to be the type of TVMContext or a list of " "TVMCTVMContext") - has_cpu = False + cpu_ctx_index = -1 for i, cur_ctx in enumerate(ctx): if not isinstance(cur_ctx, TVMContext): raise ValueError("ctx has to be the type of TVMContext or a list " "of TVMCTVMContext") if cur_ctx.device_type == tvm.cpu(0).device_type: - has_cpu = True + cpu_ctx_index = i elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: ctx[0], ctx[i] = ctx[i], ctx[0] @@ -56,31 +56,23 @@ def create(graph_json_str, libmod, ctx): device_ids.append(cur_ctx.device_id) if device_types[0] >= rpc_base.RPC_SESS_MASK: - if num_devices > 1: - raise ValueError("RPC hasn't been supported for heterogeneous " - "execution yet.") assert libmod.type_key == "rpc" assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK - return GraphModule(fcreate(graph_json_str, hmod, device_types[0], - device_ids[0]), ctx[0]) + return GraphModule(fcreate(graph_json_str, hmod, num_devices, + *device_types, *device_ids), ctx[0]) # Assume CPU is the host processor when there are multiple devices on # a hardware platform. - if (num_devices > 1) and (not has_cpu): + if (num_devices > 1) and (cpu_ctx_index < 0): raise RuntimeError( "CPU should be the host processor for heterogenous execution, but" " not found in ctx.") - - device_type_arr = (ctypes.c_int * num_devices)(*device_types) - void_dt_arr = ctypes.cast(device_type_arr, ctypes.c_void_p) - device_id_arr = (ctypes.c_int * num_devices)(*device_ids) - void_di_arr = ctypes.cast(device_id_arr, ctypes.c_void_p) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, void_dt_arr, - void_di_arr, num_devices), ctx[0]) + return GraphModule(fcreate(graph_json_str, libmod, num_devices, + *device_types, *device_ids), ctx[cpu_ctx_index]) class GraphModule(object): diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index ddad5bf2103b..4b3194f38c0c 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -20,9 +20,6 @@ namespace tvm { namespace runtime { -using StorageDeviceMap = std::unordered_map; -using DeviceStoragePoolMap = std::unordered_map; -using ModuleContextMap = std::unordered_map; /*! \brief Macro to do C API call. */ #define TVM_CCALL(func) \ @@ -211,6 +208,12 @@ class GraphRuntime : public ModuleNode { } private: + // Memory pool entry. + struct PoolEntry { + size_t size; + int device_type; + PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {} + }; // Node entry struct NodeEntry { uint32_t node_id; @@ -241,13 +244,6 @@ class GraphRuntime : public ModuleNode { TVMOpParam param; // inputs std::vector inputs; - // device_type is used to indicate where the node should be scheduled to. - // TODO(zhiics) device_type is defaulted to CPU for transition purpose only - // because it will have random value otherwise. Using this default value is - // to make sure homogeneous execution works correctly. It will be removed - // when we add the compiler pass as we will read the serialized value from - // json for all execution. - DLDeviceType device_type{kDLCPU}; // control deps std::vector control_deps; // JSON Loader @@ -268,20 +264,14 @@ class GraphRuntime : public ModuleNode { bitmask |= 4; } else if (key == "flatten_data") { param->flatten_data = strtoul(value.c_str(), nullptr, 10); - // TODO(zhiics) Enable the following when annotation is added to the - // heterogeneous compilation part. - // bitmask |= 8; } } - // TODO(zhiics) Add |8 when annotation is added to the heterogeneous - // compilation part. CHECK_EQ(bitmask, 1|2|4) << "invalid format"; } // JSON Loader void Load(dmlc::JSONReader *reader) { reader->BeginObject(); int bitmask = 0; - int device_type = 0; std::string key; while (reader->NextObjectItem(&key)) { if (key == "op") { @@ -297,22 +287,17 @@ class GraphRuntime : public ModuleNode { this->LoadAttrs(reader, ¶m); } else if (key == "control_deps") { reader->Read(&control_deps); - } else if (key == "device_type") { - reader->Read(&device_type); - this->device_type = static_cast(device_type); - // TODO(zhiics) Enable this when working on the compiler part. - // bitmask |= 8; } else { LOG(FATAL) << "do not support key " << key; } } - // TODO(zhiics) Add |8 in the compiler pass. CHECK_EQ(bitmask, 1|2|4) << "invalid format"; } }; struct GraphAttr { size_t storage_num_not_alloctaed{0}; std::vector storage_id; + std::vector device_index; std::vector dltype; std::vector > shape; // The graph attribute fields. @@ -348,6 +333,14 @@ class GraphRuntime : public ModuleNode { reader->Read(&shape); CHECK(!reader->NextArrayItem()); bitmask |= 4; + } else if (key == "device_index") { + reader->BeginArray(); + CHECK(reader->NextArrayItem()); + reader->Read(&type); + CHECK_EQ(type, "list_int"); + CHECK(reader->NextArrayItem()); + reader->Read(&device_index); + CHECK(!reader->NextArrayItem()); } else { reader->BeginArray(); CHECK(reader->NextArrayItem()); @@ -400,8 +393,6 @@ class GraphRuntime : public ModuleNode { void SetupStorage(); /*! \brief Setup the executors. */ void SetupOpExecs(); - /*! \brief Get storage id to device map. */ - StorageDeviceMap GetStorageDeviceMap() const; /*! * \brief Create a executtion function given input. * \param attrs The node attributes. @@ -443,8 +434,8 @@ class GraphRuntime : public ModuleNode { tvm::runtime::Module module_; /*! \brief Execution context of all devices including the host. */ std::vector ctxs_; - /*! \brief Common storage pool for each device. */ - DeviceStoragePoolMap device_storage_pool_; + /*! \brief Common storage pool for all devices. */ + std::vector storage_pool_; /*! \brief Data entry of each node. */ std::vector data_entry_; /*! \brief Operator on each node. */ @@ -481,34 +472,6 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { } } -// Return a storage id to device type map. This map will be used to help memory -// allocation for the storage pool of each device. It will be also used to -// allocate memory to each data_entry_. -StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const { - StorageDeviceMap sid_dev_map; - for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) { - const auto& inode = nodes_[nid]; - for (const auto& e : inode.inputs) { - uint32_t eid = this->entry_id(e); - uint32_t sid = attrs_.storage_id[eid]; - auto en_dev = nodes_[e.node_id].device_type; - CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev) - << "Cannot map the same storage id to multiple devices."; - sid_dev_map[sid] = en_dev; - } - } - // Get all output entries. - for (const auto& output : outputs_) { - uint32_t eid = this->entry_id(output); - uint32_t sid = attrs_.storage_id[eid]; - auto en_dev = nodes_[output.node_id].device_type; - CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev) - << "Cannot map the same storage id to multiple devices."; - sid_dev_map[sid] = en_dev; - } - return sid_dev_map; -} - void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; @@ -516,12 +479,16 @@ void GraphRuntime::SetupStorage() { vtype.push_back(tvm::runtime::String2TVMType(s_type)); } - const StorageDeviceMap& sid_dev_map = GetStorageDeviceMap(); - std::unordered_map device_pool_entry_bytes; - - // Find the maximum space size for each device. + // Size and device type of each storage pool entry. + std::vector pool_entry; + // Find the maximum space size. for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; + // Use the fallback device if no device index is available. + int device_type = static_cast(ctxs_[0].device_type); + if (!attrs_.device_index.empty()) { + device_type = attrs_.device_index[i]; + } size_t size = 1; for (int64_t sz : attrs_.shape[i]) { size *= static_cast(sz); @@ -533,37 +500,43 @@ void GraphRuntime::SetupStorage() { size_t bytes = (bits / 8U) * size; uint32_t sid = static_cast(storage_id); - device_pool_entry_bytes[sid] = - std::max(device_pool_entry_bytes[sid], bytes); + if (sid >= pool_entry.size()) { + pool_entry.resize(sid + 1, {0, -1}); + } else { + CHECK(pool_entry[sid].device_type == -1 || + pool_entry[sid].device_type == device_type) + << "The same pool entry cannot be assigned to multiple devices"; + } + pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); + pool_entry[sid].device_type = device_type; } - // Allocate the space on each device. - for (const auto& pit : device_pool_entry_bytes) { + // Allocate the space. + for (size_t i = 0; i < pool_entry.size(); ++i) { std::vector shape; - shape.push_back(static_cast(pit.second + 3) / 4); TVMContext ctx = ctxs_[0]; // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. for (const auto& cit : ctxs_) { - if (sid_dev_map.at(pit.first) == cit.device_type) { + if (pool_entry[i].device_type == static_cast(cit.device_type)) { ctx = cit; break; } } - device_storage_pool_[pit.first] = - NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); + shape.push_back(static_cast(pool_entry[i].size + 3) / 4); + storage_pool_.push_back( + NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); } // Assign the pooled entries. A unified memory pool is used to simplifiy // memory assignment for each node entry. The allocated memory on each device - // is mapped to this pool by querying the storage id to device type map. + // is mapped to this pool. data_entry_.resize(num_node_entries()); for (size_t i = 0; i < data_entry_.size(); ++i) { - uint32_t storage_id = static_cast(attrs_.storage_id[i]); - CHECK(device_storage_pool_.count(storage_id)) - << "The storage hasn't been assigned to a specific device."; + int storage_id = attrs_.storage_id[i]; + CHECK_LT(static_cast(storage_id), storage_pool_.size()); data_entry_[i] = - device_storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); } } @@ -715,59 +688,57 @@ Module GraphRuntimeCreate(const std::string& sym_json, // Get all context for the host and other runtime devices. std::vector GetAllContext(const TVMArgs& args) { - std::vector ret; - int* device_types = args[2].ptr(); - int* device_ids = args[3].ptr(); - int num_devices = args[4]; - - TVMContext ctx; - for (int i = 0; i < num_devices; i++) { - ctx.device_type = static_cast(device_types[i]); - ctx.device_id = device_ids[i]; - ret.push_back(ctx); + // Reserve the first item as the fallback device. + std::vector ret(1); + if (args.num_args == 4) { + int dev_type = args[2]; + ret[0].device_type = static_cast(dev_type); + ret[0].device_id = args[3]; + } else { + int num_devices = args[2]; + CHECK_EQ(args.num_args - 3, num_devices * 2) + << "The number of device_type and device_id passed in doesn't match, " + "or the number of is not the same as the number of devices."; + TVMContext ctx; + for (int i = 0; i < num_devices; i++) { + int dev_type = args[3 + i]; + ctx.device_type = static_cast(dev_type); + ctx.device_id = args[3 + i + num_devices]; + if (ctx.device_type == static_cast(kDLCPU)) { + ret[0] = ctx; + } else { + ret.push_back(ctx); + } + } } - return ret; } +// 4-argument version is currently reserved to keep support of calling +// from tvm4j and javascript, since they don't have heterogeneous +// execution support yet. For heterogenenous execution, at least 5 arguments will +// be passed in. The third one is the number of devices. +// Eventually, we will only probably pass TVMContext for all the languages. TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - std::vector contexts; - // 4-argument version is currently reserved to keep support of calling - // from tvm4j and javascript, since they don't have heterogeneous - // execution support yet. For heterogenenous execution, 5 arguments will - // be passed in. They are graph_json, module, list of context types, list - // of context ids, and the number of devices. - // Eventually, we will only have the version with 5 arguments when we - // support heterogeneous execution for Java and js. - if (args.num_args == 4) { - TVMContext ctx; - int dev_type = args[2]; - ctx.device_type = static_cast(dev_type); - ctx.device_id = args[3]; - contexts.push_back(ctx); - } else if (args.num_args == 5) { - contexts = GetAllContext(args); - } else { - LOG(FATAL) - << "The expected number of arguments for graph_runtime.create is " - "4 or 5, but it has " - << args.num_args; - } + CHECK_GE(args.num_args, 4) + << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; + const auto& contexts = GetAllContext(args); *rv = GraphRuntimeCreate(args[0], args[1], contexts); }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") .set_body([](TVMArgs args, TVMRetValue* rv) { + CHECK_GE(args.num_args, 4) << "The expected number of arguments for " + "graph_runtime.remote_create is " + "at least 4, but it has " + << args.num_args; void* mhandle = args[1]; - TVMContext ctx; - int dev_type = args[2]; - ctx.device_type = static_cast(dev_type); - ctx.device_id = args[3]; - std::vector contexts{ctx}; - *rv = GraphRuntimeCreate(args[0], - *static_cast(mhandle), - contexts); + const auto& contexts = GetAllContext(args); + *rv = GraphRuntimeCreate( + args[0], *static_cast(mhandle), contexts); }); } // namespace runtime } // namespace tvm diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 4805bd44ec73..b8fcd089bc87 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -32,12 +32,10 @@ def get_simplex_graph(host_dev_type, device_dev_type): A json encoded object. """ # Construct each node in the graph. - var_a = {"op": "null", "name": "A", "device_type": device_dev_type, - "inputs": []} - var_b = {"op": "null", "name": "B", "device_type": device_dev_type, - "inputs": []} + var_a = {"op": "null", "name": "A", "inputs": []} + var_b = {"op": "null", "name": "B", "inputs": []} elemwise_add = { - "op": "tvm_op", "name": "elemwise_add", "device_type": device_dev_type, + "op": "tvm_op", "name": "elemwise_add", "attrs": { "flatten_data": "1", "func_name": "elemwise_add", @@ -49,7 +47,6 @@ def get_simplex_graph(host_dev_type, device_dev_type): copy = { "op": "device_copy_op", "name": "__copy_add_to_sub", - "device_type": host_dev_type, "attrs": { "flatten_data": "0", "func_name": "__copy", @@ -58,11 +55,9 @@ def get_simplex_graph(host_dev_type, device_dev_type): }, "inputs": [[2, 0, 0]] } - var_c = {"op": "null", "name": "C", "device_type": host_dev_type, - "inputs": []} + var_c = {"op": "null", "name": "C", "inputs": []} elemwise_sub = { "op": "tvm_op", "name": "elemwise_sub", - "device_type": host_dev_type, "attrs": { "flatten_data": "0", "func_name": "elemwise_sub", @@ -81,9 +76,9 @@ def get_simplex_graph(host_dev_type, device_dev_type): attrs = { "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]], "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]], - "device_type": ["list_int", [device_dev_type, device_dev_type, - device_dev_type, host_dev_type, - host_dev_type, host_dev_type]], + "device_index": ["list_int", [device_dev_type, device_dev_type, + device_dev_type, host_dev_type, + host_dev_type, host_dev_type]], "dtype": ["list_int", [0, 0, 0, 0, 0, 0]], "dltype": ["list_str", ["float32", "float32", "float32", "float32", "float32", "float32"]] @@ -129,7 +124,7 @@ def check_device(device, target_device): schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add") - lib_add, host_funcs_add = tvm.build(lower_add, target=target_device, + host_funcs_add, lib_add = tvm.build(lower_add, target=target_device, name="elemwise_add", postpone_host_codegen=True) @@ -147,15 +142,18 @@ def check_device(device, target_device): elemwise_sub], name="elemwise_sub") - lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host, + host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host, name="elemwise_sub", postpone_host_codegen=True) host_funcs = host_funcs_add + host_funcs_sub - combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub], - target_host=target_host) + mhost = tvm.codegen.build_module(host_funcs, target_host) + if lib_add: + mhost.import_module(lib_add) + if lib_sub: + mhost.import_module(lib_sub) ctx = [host_ctx, device_ctx] - mod = graph_runtime.create(graph, combined_mod, ctx) + mod = graph_runtime.create(graph, mhost, ctx) params = {} params["A"] = tensor_a = np.random.uniform( size=shape).astype(tensor_a.dtype) @@ -203,13 +201,10 @@ def get_duplex_graph(host_dev_type, device_dev_type): A json encoded object. """ # Construct each node in the graph. - var_a = {"op": "null", "name": "A", "device_type": device_dev_type, - "inputs": []} - var_b = {"op": "null", "name": "B", "device_type": device_dev_type, - "inputs": []} + var_a = {"op": "null", "name": "A", "inputs": []} + var_b = {"op": "null", "name": "B", "inputs": []} elemwise_add0 = { - "op": "tvm_op", "name": "elemwise_add0", "device_type": - device_dev_type, + "op": "tvm_op", "name": "elemwise_add0", "attrs": { "flatten_data": "1", "func_name": "elemwise_add0", @@ -221,7 +216,6 @@ def get_duplex_graph(host_dev_type, device_dev_type): copy_add_sub = { "op": "device_copy_op", "name": "__copy_add_to_sub", - "device_type": host_dev_type, "attrs": { "flatten_data": "0", "func_name": "__copy", @@ -230,11 +224,9 @@ def get_duplex_graph(host_dev_type, device_dev_type): }, "inputs": [[2, 0, 0]] } - var_c = {"op": "null", "name": "C", "device_type": host_dev_type, - "inputs": []} + var_c = {"op": "null", "name": "C", "inputs": []} elemwise_sub = { "op": "tvm_op", "name": "elemwise_sub", - "device_type": host_dev_type, "attrs": { "flatten_data": "0", "func_name": "elemwise_sub", @@ -246,7 +238,6 @@ def get_duplex_graph(host_dev_type, device_dev_type): copy_sub_add = { "op": "device_copy_op", "name": "__copy_sub_to_add", - "device_type": device_dev_type, "attrs": { "flatten_data": "0", "func_name": "__copy", @@ -255,11 +246,9 @@ def get_duplex_graph(host_dev_type, device_dev_type): }, "inputs": [[5, 0, 0]] } - var_d = {"op": "null", "name": "D", "device_type": device_dev_type, - "inputs": []} + var_d = {"op": "null", "name": "D", "inputs": []} elemwise_add1 = { "op": "tvm_op", "name": "elemwise_add1", - "device_type": device_dev_type, "attrs": { "flatten_data": "0", "func_name": "elemwise_add1", @@ -280,11 +269,11 @@ def get_duplex_graph(host_dev_type, device_dev_type): "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]], "shape": ["list_shape", [shape, shape, shape, shape, shape, shape, shape, shape, shape]], - "device_type": ["list_int", [device_dev_type, device_dev_type, - device_dev_type, - host_dev_type, host_dev_type, host_dev_type, - device_dev_type, device_dev_type, - device_dev_type]], + "device_index": ["list_int", [device_dev_type, device_dev_type, + device_dev_type, + host_dev_type, host_dev_type, host_dev_type, + device_dev_type, device_dev_type, + device_dev_type]], "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]], "dltype": ["list_str", ["float32", "float32", "float32", "float32", "float32", "float32", @@ -349,7 +338,7 @@ def check_device(device, target_device): lower_add1 = tvm.lower( add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1") - lib_add, host_funcs_add = tvm.build([lower_add0, lower_add1], + host_funcs_add, lib_add = tvm.build([lower_add0, lower_add1], target=target_device, postpone_host_codegen=True) @@ -361,12 +350,15 @@ def check_device(device, target_device): lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub") - lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host, + host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host, postpone_host_codegen=True) host_funcs = host_funcs_add + host_funcs_sub + mhost = tvm.codegen.build_module(host_funcs, target_host) + if lib_add: + mhost.import_module(lib_add) + if lib_sub: + mhost.import_module(lib_sub) - combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub], - target_host=target_host) ctx = [host_ctx, device_ctx] params = {} params["A"] = tensor_a = np.random.uniform( @@ -379,7 +371,7 @@ def check_device(device, target_device): size=shape).astype(tensor_d.dtype) def check_verify(): - mod = graph_runtime.create(graph, combined_mod, ctx) + mod = graph_runtime.create(graph, mhost, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) @@ -389,7 +381,7 @@ def check_verify(): def check_load_module(): temp = util.tempdir() path_lib = temp.relpath("deploy.so") - combined_mod.export_library(path_lib) + mhost.export_library(path_lib) with open(temp.relpath("deploy.json"), "w") as out_file: out_file.write(graph) loaded_lib = tvm.module.load(path_lib) From 51a6e3eec79110b7c5732dc6483fabc232bc3753 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 20 Sep 2018 14:09:15 -0700 Subject: [PATCH 07/10] fix python error --- python/tvm/contrib/graph_runtime.py | 38 +++----- src/runtime/graph/graph_runtime.cc | 86 ++++++++----------- .../unittest/test_runtime_heterogeneous.py | 10 +-- 3 files changed, 53 insertions(+), 81 deletions(-) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index a857225008f3..5ca55f77c6a8 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -38,41 +38,29 @@ def create(graph_json_str, libmod, ctx): elif not isinstance(ctx, (list, tuple)): raise ValueError("ctx has to be the type of TVMContext or a list of " "TVMCTVMContext") - cpu_ctx_index = -1 for i, cur_ctx in enumerate(ctx): if not isinstance(cur_ctx, TVMContext): raise ValueError("ctx has to be the type of TVMContext or a list " - "of TVMCTVMContext") - if cur_ctx.device_type == tvm.cpu(0).device_type: - cpu_ctx_index = i - elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: + "of TVMContext") + if cur_ctx.device_type == tvm.cpu(0).device_type or \ + cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: ctx[0], ctx[i] = ctx[i], ctx[0] - num_devices = len(ctx) - device_types = [] - device_ids = [] - for cur_ctx in ctx: - device_types.append(cur_ctx.device_type) - device_ids.append(cur_ctx.device_id) - - if device_types[0] >= rpc_base.RPC_SESS_MASK: + # ctx[0] is used as the primary/fallback context. All other ones are used + # as device context for heterogeneous execution. + device_type_id = [x for c in ctx[1:] for x in [c.device_type, c.device_id]] + if ctx[0].device_type >= rpc_base.RPC_SESS_MASK: assert libmod.type_key == "rpc" assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") - device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK - return GraphModule(fcreate(graph_json_str, hmod, num_devices, - *device_types, *device_ids), ctx[0]) - - # Assume CPU is the host processor when there are multiple devices on - # a hardware platform. - if (num_devices > 1) and (cpu_ctx_index < 0): - raise RuntimeError( - "CPU should be the host processor for heterogenous execution, but" - " not found in ctx.") + device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK + return GraphModule(fcreate(graph_json_str, hmod, device_type, + ctx[0].device_id, *device_type_id), ctx[0]) + fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, num_devices, - *device_types, *device_ids), ctx[cpu_ctx_index]) + return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type, + ctx[0].device_id, *device_type_id), ctx[0]) class GraphModule(object): diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 4b3194f38c0c..a48047fe369c 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -264,9 +264,10 @@ class GraphRuntime : public ModuleNode { bitmask |= 4; } else if (key == "flatten_data") { param->flatten_data = strtoul(value.c_str(), nullptr, 10); + bitmask |= 8; } } - CHECK_EQ(bitmask, 1|2|4) << "invalid format"; + CHECK_EQ(bitmask, 1|2|4|8) << "invalid format"; } // JSON Loader void Load(dmlc::JSONReader *reader) { @@ -512,18 +513,16 @@ void GraphRuntime::SetupStorage() { } // Allocate the space. - for (size_t i = 0; i < pool_entry.size(); ++i) { + for (const auto& pit : pool_entry) { std::vector shape; - TVMContext ctx = ctxs_[0]; // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. - for (const auto& cit : ctxs_) { - if (pool_entry[i].device_type == static_cast(cit.device_type)) { - ctx = cit; - break; - } - } - shape.push_back(static_cast(pool_entry[i].size + 3) / 4); + const auto& cit = + std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) { + return pit.device_type == static_cast(c.device_type); + }); + TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; + shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back( NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); } @@ -554,8 +553,7 @@ void GraphRuntime::SetupOpExecs() { uint32_t eid = this->entry_id(nid, index); args.push_back(*(data_entry_[eid].operator->())); } - CHECK(inode.op_type == "tvm_op" || inode.op_type == "device_copy_op") - << "Can only take tvm_op or device_copy_op as op"; + CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size()); } @@ -689,27 +687,13 @@ Module GraphRuntimeCreate(const std::string& sym_json, // Get all context for the host and other runtime devices. std::vector GetAllContext(const TVMArgs& args) { // Reserve the first item as the fallback device. - std::vector ret(1); - if (args.num_args == 4) { - int dev_type = args[2]; - ret[0].device_type = static_cast(dev_type); - ret[0].device_id = args[3]; - } else { - int num_devices = args[2]; - CHECK_EQ(args.num_args - 3, num_devices * 2) - << "The number of device_type and device_id passed in doesn't match, " - "or the number of is not the same as the number of devices."; - TVMContext ctx; - for (int i = 0; i < num_devices; i++) { - int dev_type = args[3 + i]; - ctx.device_type = static_cast(dev_type); - ctx.device_id = args[3 + i + num_devices]; - if (ctx.device_type == static_cast(kDLCPU)) { - ret[0] = ctx; - } else { - ret.push_back(ctx); - } - } + std::vector ret; + TVMContext ctx; + for (int i = 2; i < args.num_args; i += 2) { + int dev_type = args[i]; + ctx.device_type = static_cast(dev_type); + ctx.device_id = args[i + 1]; + ret.push_back(ctx); } return ret; } @@ -720,25 +704,25 @@ std::vector GetAllContext(const TVMArgs& args) { // be passed in. The third one is the number of devices. // Eventually, we will only probably pass TVMContext for all the languages. TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") - .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) - << "The expected number of arguments for graph_runtime.create is " - "at least 4, but it has " - << args.num_args; - const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], args[1], contexts); - }); + .set_body([](TVMArgs args, TVMRetValue* rv) { + CHECK_GE(args.num_args, 4) + << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; + const auto& contexts = GetAllContext(args); + *rv = GraphRuntimeCreate(args[0], args[1], contexts); + }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") - .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) << "The expected number of arguments for " - "graph_runtime.remote_create is " - "at least 4, but it has " - << args.num_args; - void* mhandle = args[1]; - const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate( - args[0], *static_cast(mhandle), contexts); - }); + .set_body([](TVMArgs args, TVMRetValue* rv) { + CHECK_GE(args.num_args, 4) << "The expected number of arguments for " + "graph_runtime.remote_create is " + "at least 4, but it has " + << args.num_args; + void* mhandle = args[1]; + const auto& contexts = GetAllContext(args); + *rv = GraphRuntimeCreate( + args[0], *static_cast(mhandle), contexts); + }); } // namespace runtime } // namespace tvm diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index b8fcd089bc87..d129618375fa 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -45,7 +45,7 @@ def get_simplex_graph(host_dev_type, device_dev_type): "inputs": [[0, 0, 0], [1, 0, 0]] } copy = { - "op": "device_copy_op", + "op": "tvm_op", "name": "__copy_add_to_sub", "attrs": { "flatten_data": "0", @@ -167,7 +167,7 @@ def check_device(device, target_device): np.testing.assert_equal( out.asnumpy(), (tensor_a + tensor_b) - tensor_c) - dev_tar = {"gpu": "cuda", "opencl": "opencl"} + dev_tar = {"cuda": "cuda", "opencl": "opencl"} for device, target in dev_tar.items(): check_device(device, target) @@ -214,7 +214,7 @@ def get_duplex_graph(host_dev_type, device_dev_type): "inputs": [[0, 0, 0], [1, 0, 0]] } copy_add_sub = { - "op": "device_copy_op", + "op": "tvm_op", "name": "__copy_add_to_sub", "attrs": { "flatten_data": "0", @@ -236,7 +236,7 @@ def get_duplex_graph(host_dev_type, device_dev_type): "inputs": [[3, 0, 0], [4, 0, 0]] } copy_sub_add = { - "op": "device_copy_op", + "op": "tvm_op", "name": "__copy_sub_to_add", "attrs": { "flatten_data": "0", @@ -396,7 +396,7 @@ def check_load_module(): check_verify() check_load_module() - dev_tar = {"gpu": "cuda", "opencl": "opencl"} + dev_tar = {"cuda": "cuda", "opencl": "opencl"} for device, target in dev_tar.items(): check_device(device, target) From c734697a1f4cd474d0cfc4d2909e5ed09a738990 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 21 Sep 2018 09:13:09 -0700 Subject: [PATCH 08/10] Remove ctx from GraphModule and fix typo --- python/tvm/build_module.py | 2 +- python/tvm/contrib/graph_runtime.py | 19 ++++++------------- .../unittest/test_runtime_heterogeneous.py | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 9f63d0a6a50d..8e0d16286d6a 100755 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -386,7 +386,7 @@ def build(sch, name="default_function", binds=None, postpone_host_codegen=False): - """Build a function with arguments as signiture. Code will be generated + """Build a function with arguments as signature. Code will be generated for a device specified by the target. For homogeneous execution, a module that contains both host and device code is returned. For heterogeneous execution, a list of lowered functions for the host and a module containing diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 5ca55f77c6a8..515f74a438e3 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -1,12 +1,10 @@ """Minimum graph runtime that executes graph containing TVM PackedFunc.""" import numpy as np -import tvm from .._ffi.base import string_types from .._ffi.function import get_global_func from .._ffi.runtime_ctypes import TVMContext from ..rpc import base as rpc_base -from .. import ndarray as nd def create(graph_json_str, libmod, ctx): """Create a runtime executor module given a graph and module. @@ -42,8 +40,7 @@ def create(graph_json_str, libmod, ctx): if not isinstance(cur_ctx, TVMContext): raise ValueError("ctx has to be the type of TVMContext or a list " "of TVMContext") - if cur_ctx.device_type == tvm.cpu(0).device_type or \ - cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: + if cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: ctx[0], ctx[i] = ctx[i], ctx[0] # ctx[0] is used as the primary/fallback context. All other ones are used @@ -56,11 +53,11 @@ def create(graph_json_str, libmod, ctx): fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK return GraphModule(fcreate(graph_json_str, hmod, device_type, - ctx[0].device_id, *device_type_id), ctx[0]) + ctx[0].device_id, *device_type_id)) fcreate = get_global_func("tvm.graph_runtime.create") return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type, - ctx[0].device_id, *device_type_id), ctx[0]) + ctx[0].device_id, *device_type_id)) class GraphModule(object): @@ -82,12 +79,9 @@ class GraphModule(object): ---------- module : Module The interal tvm module that holds the actual graph functions. - - ctx : TVMContext - The context this module is under """ - def __init__(self, module, ctx): + def __init__(self, module): self.module = module self._set_input = module["set_input"] self._run = module["run"] @@ -99,7 +93,6 @@ def __init__(self, module, ctx): except AttributeError: pass self._load_params = module["load_params"] - self.ctx = ctx def set_input(self, key=None, value=None, **params): """Set inputs to the module via kwargs @@ -116,14 +109,14 @@ def set_input(self, key=None, value=None, **params): Additonal arguments """ if key: - self._set_input(key, nd.array(value, ctx=self.ctx)) + self._get_input(key).copyfrom(value) if params: # upload big arrays first to avoid memory issue in rpc mode keys = list(params.keys()) keys.sort(key=lambda x: -np.prod(params[x].shape)) for k in keys: - self._set_input(k, nd.array(params[k], ctx=self.ctx)) + self._get_input(k).copyfrom(params[k]) def run(self, **input_dict): """Run forward execution of the graph diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index d129618375fa..b916ee285717 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -9,7 +9,7 @@ def get_simplex_graph(host_dev_type, device_dev_type): r""" Return the hand-crafted json object where only one copy node is - inserted. Tis node copies data from the target device to cpu. + inserted. This node copies data from the target device to cpu. The network is constructed as following: A B \ / From 66b208fb0d4d4de153103bcedf267311e5df2230 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 21 Sep 2018 14:17:42 -0700 Subject: [PATCH 09/10] fix rpc --- python/tvm/contrib/graph_runtime.py | 44 +++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 515f74a438e3..8b31175b6319 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -36,28 +36,39 @@ def create(graph_json_str, libmod, ctx): elif not isinstance(ctx, (list, tuple)): raise ValueError("ctx has to be the type of TVMContext or a list of " "TVMCTVMContext") - for i, cur_ctx in enumerate(ctx): + for cur_ctx in ctx: if not isinstance(cur_ctx, TVMContext): raise ValueError("ctx has to be the type of TVMContext or a list " "of TVMContext") - if cur_ctx.device_type >= rpc_base.RPC_SESS_MASK: - ctx[0], ctx[i] = ctx[i], ctx[0] - - # ctx[0] is used as the primary/fallback context. All other ones are used - # as device context for heterogeneous execution. - device_type_id = [x for c in ctx[1:] for x in [c.device_type, c.device_id]] - if ctx[0].device_type >= rpc_base.RPC_SESS_MASK: - assert libmod.type_key == "rpc" - assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index + + # device_type_id[0], device_type_id[1] are used as the primary/fallback + # context type and id. All other ones are used as device context for + # heterogeneous execution. + num_rpc_ctx = 0 + device_type_id = [] + for cur_ctx in ctx: + device_type = cur_ctx.device_type + if device_type >= rpc_base.RPC_SESS_MASK: + assert libmod.type_key == "rpc" + assert rpc_base._SessTableIndex( + libmod) == cur_ctx._rpc_sess._tbl_index + num_rpc_ctx += 1 + device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK + device_type_id.append(device_type) + device_type_id.append(cur_ctx.device_id) + + if 0 < num_rpc_ctx < len(ctx): + raise ValueError("Either all or none of the contexts should be rpc.") + + if num_rpc_ctx == len(ctx): hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") - device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK - return GraphModule(fcreate(graph_json_str, hmod, device_type, - ctx[0].device_id, *device_type_id)) + return GraphModule(fcreate(graph_json_str, hmod, device_type_id[0], + device_type_id[1], *device_type_id[2:])) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type, - ctx[0].device_id, *device_type_id)) + return GraphModule(fcreate(graph_json_str, libmod, device_type_id[0], + device_type_id[1], *device_type_id[2:])) class GraphModule(object): @@ -72,9 +83,6 @@ class GraphModule(object): module : Module The interal tvm module that holds the actual graph functions. - ctx : TVMContext - The context this module is under - Attributes ---------- module : Module From 143341d7d4e137c111e708e66f6b83b028fcc830 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 21 Sep 2018 19:48:33 -0700 Subject: [PATCH 10/10] pass *device_type_id --- python/tvm/contrib/graph_runtime.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 8b31175b6319..f0e83eec0bb8 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -63,12 +63,10 @@ def create(graph_json_str, libmod, ctx): if num_rpc_ctx == len(ctx): hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") - return GraphModule(fcreate(graph_json_str, hmod, device_type_id[0], - device_type_id[1], *device_type_id[2:])) + return GraphModule(fcreate(graph_json_str, hmod, *device_type_id)) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, device_type_id[0], - device_type_id[1], *device_type_id[2:])) + return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) class GraphModule(object):