From 0496e1d81d2a19f5594a7862bf8a0455d46404ab Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 7 Sep 2018 11:00:57 -0700
Subject: [PATCH 01/10] Rebase to upstream master

combine modules for heterogeneous execution
---
 python/tvm/__init__.py                        |   2 +-
 python/tvm/build_module.py                    |  77 +++-
 python/tvm/contrib/graph_runtime.py           |  71 ++-
 src/runtime/graph/graph_runtime.cc            | 260 +++++++----
 .../unittest/test_runtime_heterogeneous.py    | 413 ++++++++++++++++++
 5 files changed, 724 insertions(+), 99 deletions(-)
 create mode 100644 tests/python/unittest/test_runtime_heterogeneous.py

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index a028dfeddf36..7076cf74b630 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -31,7 +31,7 @@
 from .node import register_node
 from .ndarray import register_extension
 from .schedule import create_schedule
-from .build_module import build, lower, build_config
+from .build_module import build, lower, build_config, combine_modules
 from .tag import tag_scope
 
 # Contrib initializers
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 70935cde1816..88506b49d175 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -384,8 +384,14 @@ def build(sch,
           target=None,
           target_host=None,
           name="default_function",
-          binds=None):
-    """Build a function with arguments as signiture.
+          binds=None,
+          postpone_host_codegen=False):
+    """Build a function with arguments as signiture. Code will be generated
+    for a device specified by the target. For homogeneous execution, a module
+    that contains both host and device code is returned. For heterogeneous
+    execution, a list of lowered functions for the host and a module containing
+    device code are returned, but actual code generation for the host module is
+    postponed after code generation is finished for all devices.
 
     Parameters
     ----------
@@ -414,6 +420,11 @@ def build(sch,
         Dictionary that maps the binding of symbolic buffer to Tensor.
         By default, a new buffer is created for each tensor in the argument.
 
+    postpone_host_codegen : bool, optional
+        A bool value that indicates if code generation for the host module
+        should be postponed. This variable is set to be true for heterogeneous
+        execution. Otherwise, it is defaulted to false.
+
     Returns
     -------
     f : Function, or pair of functions
@@ -498,9 +509,67 @@ def build(sch,
     fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice]
     fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost]
     fhost = [ir_pass.CombineContextCall(x) for x in fhost]
-    mhost = codegen.build_module(fhost, str(target_host))
 
+    # Append fhost to the device module and return the updated module. All
+    # device modules will be imported to the host module after all of them are
+    # collected.
+    mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
+    if postpone_host_codegen:
+        return mdev, fhost
+
+    mhost = codegen.build_module(fhost, str(target_host))
     if fdevice:
-        mdev = codegen.build_module(fdevice, str(target_device))
         mhost.import_module(mdev)
     return mhost
+
+def combine_modules(host_funcs, device_modules, target_host=None):
+    """ Generate the host module for the lowered host functions by combining
+    them together. Then all device modules are imported to the combined host
+    module. This function is used for heterogeneous execution where multiple
+    device modules need to be imported to the host module. For homogeneous
+    execution, tvm.build is sufficient.
+
+    Parameters
+    ----------
+    host_funcs : LoweredFunc or list of LoweredFunc.
+        Lowered functions to be combined as the host module through codegen.
+
+    device_modules : tvm.module or list of tvm.module.
+        Device modules will be imported into host module.
+
+    Returns
+    -------
+        mhost : The module that contains both host and device code.
+    """
+    if isinstance(host_funcs, container.LoweredFunc):
+        host_funcs = [host_funcs]
+    elif not isinstance(host_funcs, (list, tuple, container.Array)):
+        raise ValueError("host_fucns must be the type of LoweredFunc or list "
+                         "of LoweredFunc.")
+
+    if isinstance(device_modules, module.Module):
+        device_modules = [device_modules]
+    elif not isinstance(device_modules, (list, tuple, container.Array)):
+        raise ValueError("host_funcs must be the type of Module or list of "
+                         "Module.")
+    for func in host_funcs:
+        if not isinstance(func, container.LoweredFunc):
+            raise ValueError("host_fucns must be the type of LoweredFunc or "
+                             "list of LoweredFunc.")
+    for device_mod in device_modules:
+        if device_mod and not isinstance(device_mod, module.Module):
+            raise ValueError("device_modules must be the type of Module or "
+                             "list of Module.")
+
+    if not target_host:
+        target_host = "llvm" if module.enabled("llvm") else "stackvm"
+    target_host = _target.create(target_host)
+
+    # Generate code for the list of host functions.
+    mhost = codegen.build_module(host_funcs, str(target_host))
+    # Import all device modules.
+    for device_mod in device_modules:
+        if device_mod:
+            mhost.import_module(device_mod)
+
+    return mhost
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index e49b966e6a1e..2f8354068337 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -1,28 +1,28 @@
 """Minimum graph runtime that executes graph containing TVM PackedFunc."""
 import numpy as np
+import tvm
 
 from .._ffi.base import string_types
 from .._ffi.function import get_global_func
+from .._ffi.runtime_ctypes import TVMContext
 from ..rpc import base as rpc_base
 from .. import ndarray as nd
 
-
 def create(graph_json_str, libmod, ctx):
     """Create a runtime executor module given a graph and module.
-
     Parameters
     ----------
     graph_json_str : str or graph class
         The graph to be deployed in json format output by nnvm graph.
         The graph can only contain one operator(tvm_op) that
         points to the name of PackedFunc in the libmod.
-
     libmod : tvm.Module
         The module of the corresponding function
-
-    ctx : TVMContext
-        The context to deploy the module, can be local or remote.
-
+    ctx : TVMContext or list of TVMContext
+        The context to deploy the module. It can be local or remote when there
+        is only one TVMContext. Otherwise, the first context in the list will
+        be used as this purpose. All context should be given for heterogeneous
+        execution.
     Returns
     -------
     graph_module : GraphModule
@@ -33,17 +33,54 @@ def create(graph_json_str, libmod, ctx):
             graph_json_str = graph_json_str._tvm_graph_json()
         except AttributeError:
             raise ValueError("Type %s is not supported" % type(graph_json_str))
-    device_type = ctx.device_type
-    device_id = ctx.device_id
-    if device_type >= rpc_base.RPC_SESS_MASK:
+    if isinstance(ctx, TVMContext):
+        ctx = [ctx]
+    elif not isinstance(ctx, (list, tuple)):
+        raise ValueError("ctx has to be the type of TVMContext or a list of "
+                         "TVMCTVMContext")
+    has_cpu = False
+    for i, cur_ctx in enumerate(ctx):
+        if not isinstance(cur_ctx, TVMContext):
+            raise ValueError("ctx has to be the type of TVMContext or a list "
+                             "of TVMCTVMContext")
+        if cur_ctx.device_type == tvm.cpu(0).device_type:
+            has_cpu = True
+        elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
+            ctx[0], ctx[i] = ctx[i], ctx[0]
+
+    num_devices = len(ctx)
+    device_types = []
+    device_ids = []
+    for cur_ctx in ctx:
+        device_types.append(cur_ctx.device_type)
+        device_ids.append(cur_ctx.device_id)
+
+    if device_types[0] >= rpc_base.RPC_SESS_MASK:
+        if num_devices > 1:
+            raise ValueError("RPC hasn't been supported for heterogeneous "
+                             "execution yet.")
         assert libmod.type_key == "rpc"
-        assert rpc_base._SessTableIndex(libmod) == ctx._rpc_sess._tbl_index
+        assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index
         hmod = rpc_base._ModuleHandle(libmod)
-        fcreate = ctx._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        device_type = device_type % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, device_type, device_id), ctx)
+        fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
+        device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK
+        return GraphModule(fcreate(graph_json_str, hmod, device_types[0],
+                                   device_ids[0]), ctx[0])
+
+    # Assume CPU is the host processor when there are multiple devices on
+    # a hardware platform.
+    if (num_devices > 1) and (not has_cpu):
+        raise RuntimeError(
+            "CPU should be the host processor for heterogenous execution, but"
+            " not found in ctx.")
+
+    device_type_arr = (ctypes.c_int * num_devices)(*device_types)
+    void_dt_arr = ctypes.cast(device_type_arr, ctypes.c_void_p)
+    device_id_arr = (ctypes.c_int * num_devices)(*device_ids)
+    void_di_arr = ctypes.cast(device_id_arr, ctypes.c_void_p)
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, device_type, device_id), ctx)
+    return GraphModule(fcreate(graph_json_str, libmod, void_dt_arr,
+                               void_di_arr, num_devices), ctx[0])
 
 
 class GraphModule(object):
@@ -69,6 +106,7 @@ class GraphModule(object):
     ctx : TVMContext
         The context this module is under
     """
+
     def __init__(self, module, ctx):
         self.module = module
         self._set_input = module["set_input"]
@@ -177,7 +215,8 @@ def debug_get_output(self, node, out):
         if hasattr(self, '_debug_get_output'):
             self._debug_get_output(node, out)
         else:
-            raise RuntimeError("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
+            raise RuntimeError(
+                "Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
         return out
 
     def load_params(self, params_bytes):
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 162d616dea8a..54084358ef69 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -2,22 +2,29 @@
  *  Copyright (c) 2017 by Contributors
  * \file graph_runtime.cc
  */
+#include "graph_runtime.h"
+
+#include <dlpack/dlpack.h>
+#include <dmlc/json.h>
+#include <dmlc/memory_io.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/device_api.h>
-#include <dmlc/memory_io.h>
-#include <dmlc/json.h>
-#include <numeric>
+#include <tvm/runtime/serializer.h>
+
 #include <algorithm>
-#include <vector>
 #include <functional>
-#include "graph_runtime.h"
+#include <numeric>
+#include <vector>
 
 namespace tvm {
 namespace runtime {
+using StorageDeviceMap = std::unordered_map<uint32_t, DLDeviceType>;
+using DeviceStoragePoolMap = std::unordered_map<size_t, NDArray>;
+using ModuleContextMap = std::unordered_map<tvm::runtime::Module*, TVMContext>;
 
-/*! \brief macro to do C API call */
+/*! \brief Macro to do C API call. */
 #define TVM_CCALL(func)                                            \
   {                                                                \
     int ret = (func);                                              \
@@ -34,7 +41,7 @@ namespace runtime {
 class GraphRuntime : public ModuleNode {
  public:
   /*!
-   * \brief Get member function to front-end
+   * \brief Get member function to front-end.
    * \param name The name of the function.
    * \param sptr_to_self The pointer to the module node.
    * \return The corresponding member function.
@@ -58,12 +65,13 @@ class GraphRuntime : public ModuleNode {
   /*!
    * \brief Initialize the graph executor with graph and context.
    * \param graph_json The execution graph.
-   * \param module The module containing the compiled functions.
-   * \param ctx The context where the graph should sit on
+   * \param module The module containing the compiled functions for the host
+   * processor.
+   * \param ctxs The context of the host and devices where graph nodes will be
+   * executed on.
    */
-  void Init(const std::string& graph_json,
-            tvm::runtime::Module module,
-            TVMContext ctx) {
+  void Init(const std::string& graph_json, const tvm::runtime::Module& module,
+            const std::vector<TVMContext>& ctxs) {
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
     std::istringstream is(graph_json);
 #else
@@ -72,10 +80,11 @@ class GraphRuntime : public ModuleNode {
     dmlc::JSONReader reader(&is);
     this->Load(&reader);
     module_ = module;
-    ctx_ = ctx;
+    ctxs_ = ctxs;
     this->SetupStorage();
     this->SetupOpExecs();
   }
+
   /*!
    * \brief Get the input index given the name of input.
    * \param name The name of the input.
@@ -92,7 +101,7 @@ class GraphRuntime : public ModuleNode {
     return -1;
   }
   /*!
-   * \brief set index-th input to the graph.
+   * \brief Set index-th input to the graph.
    * \param index The input index.
    * \param data_in The input data.
    */
@@ -134,7 +143,7 @@ class GraphRuntime : public ModuleNode {
   /*!
    * \brief Copy index-th output to data_out.
    * \param index The output index.
-   * \param data_out the output data.
+   * \param data_out The output data.
    */
   void CopyOutputTo(int index, DLTensor* data_out) {
     CHECK_LT(static_cast<size_t>(index), outputs_.size());
@@ -172,8 +181,8 @@ class GraphRuntime : public ModuleNode {
    * from begining upto the index-th node and return output of index-th node.
    * This is costly operation and suggest to use only for debug porpose.
    *
-   * \param index: The  index of the node.
-   * \param data_out the node data.
+   * \param index The index of the node.
+   * \param data_out The node data.
    */
   void DebugGetNodeOutput(int index, DLTensor* data_out) {
     CHECK_LT(static_cast<size_t>(index), nodes_.size());
@@ -188,7 +197,7 @@ class GraphRuntime : public ModuleNode {
   }
 #endif
   /*!
-   * \brief Load parameters from binary stream
+   * \brief Load parameters from binary stream.
    * \param strm The input stream.
    */
   void LoadParams(dmlc::Stream* strm);
@@ -232,6 +241,8 @@ class GraphRuntime : public ModuleNode {
     TVMOpParam param;
     // inputs
     std::vector<NodeEntry> inputs;
+    // device_type is used to indicate where the node should be scheduled to.
+    DLDeviceType device_type;
     // control deps
     std::vector<uint32_t> control_deps;
     // JSON Loader
@@ -252,16 +263,20 @@ class GraphRuntime : public ModuleNode {
           bitmask |= 4;
         } else if (key == "flatten_data") {
           param->flatten_data = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 8;
+          // TODO(zhiics) Enable the following when annotation is added to the
+          // heterogeneous compilation part.
+          // bitmask |= 8;
         }
       }
-      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
+      // TODO(zhiics) Add |8 when annotation is added to the heterogeneous
+      // compilation part.
+      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
     }
     // JSON Loader
     void Load(dmlc::JSONReader *reader) {
       reader->BeginObject();
-      std::unordered_map<std::string, std::string> dict;
       int bitmask = 0;
+      int device_type = 0;
       std::string key;
       while (reader->NextObjectItem(&key)) {
         if (key == "op") {
@@ -277,10 +292,16 @@ class GraphRuntime : public ModuleNode {
           this->LoadAttrs(reader, &param);
         } else if (key == "control_deps") {
           reader->Read(&control_deps);
+        } else if (key == "device_type") {
+          reader->Read(&device_type);
+          this->device_type = static_cast<DLDeviceType>(device_type);
+          // TODO(zhiics) Enable this when working on the compiler part.
+          // bitmask |= 8;
         } else {
           LOG(FATAL) << "do not support key " << key;
         }
       }
+      // TODO(zhiics) Add |8 in the compiler pass.
       CHECK_EQ(bitmask, 1|2|4) << "invalid format";
     }
   };
@@ -372,13 +393,16 @@ class GraphRuntime : public ModuleNode {
   }
   /*! \brief Setup the temporal storage */
   void SetupStorage();
-  /*! \brief Setup the executors */
+  /*! \brief Setup the executors. */
   void SetupOpExecs();
+  /*! \brief Get storage id to device map. */
+  StorageDeviceMap GetStorageDeviceMap() const;
   /*!
    * \brief Create a executtion function given input.
-   * \param attrs The node attributes
+   * \param attrs The node attributes.
    * \param args The arguments to the functor, including inputs and outputs.
-   * \param num_inputs Number of inputs
+   * \param num_inputs Number of inputs.
+   * \param dev_type The device type of the tvm_op.
    * \return The created executor.
    */
   std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
@@ -392,7 +416,7 @@ class GraphRuntime : public ModuleNode {
   uint32_t entry_id(const NodeEntry& e) const {
     return entry_id(e.node_id, e.index);
   }
-  // Number of node entries
+  // Number of node entries.
   uint32_t num_node_entries() const {
     return node_row_ptr_.back();
   }
@@ -400,25 +424,25 @@ class GraphRuntime : public ModuleNode {
   uint32_t num_nodes() const {
     return static_cast<uint32_t>(nodes_.size());
   }
-  // The graph nodes.
+  /*! \brief The graph nodes. */
   std::vector<Node> nodes_;
-  // The argument nodes.
+  /*! \brief The argument nodes. */
   std::vector<uint32_t> input_nodes_;
-  // used or quick entry indexing
+  /*! \brief Used for quick entry indexing. */
   std::vector<uint32_t> node_row_ptr_;
-  // output entries
+  /*! \brief Output entries. */
   std::vector<NodeEntry> outputs_;
-  // Additional graph attributes
+  /*! \brief Additional graph attributes. */
   GraphAttr attrs_;
-  /*! \brief The code module */
+  /*! \brief The code module that contains both host and device code. */
   tvm::runtime::Module module_;
-  /*! \brief execution context */
-  TVMContext ctx_;
-  /*! \brief common storage pool */
-  std::vector<NDArray> storage_pool_;
-  /*! \brief data entry of each node */
+  /*! \brief Execution context of all devices including the host. */
+  std::vector<TVMContext> ctxs_;
+  /*! \brief Common storage pool for each device. */
+  DeviceStoragePoolMap device_storage_pool_;
+  /*! \brief Data entry of each node. */
   std::vector<NDArray> data_entry_;
-  /*! \brief operator on each node */
+  /*! \brief Operator on each node. */
   std::vector<std::function<void()> > op_execs_;
 };
 
@@ -452,16 +476,45 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) {
   }
 }
 
+// Return a storage id to device type map. This map will be used to help memory
+// allocation for the storage pool of each device. It will be also used to
+// allocate memory to each data_entry_.
+StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const {
+  StorageDeviceMap sid_dev_map;
+  for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) {
+    const auto& inode = nodes_[nid];
+    for (const auto& e : inode.inputs) {
+      uint32_t eid = this->entry_id(e);
+      uint32_t sid = attrs_.storage_id[eid];
+      auto en_dev = nodes_[e.node_id].device_type;
+      CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev)
+          << "Cannot map the same storage id to multiple devices.";
+      sid_dev_map[sid] = en_dev;
+    }
+  }
+  // Get all output entries.
+  for (const auto& output : outputs_) {
+    uint32_t eid = this->entry_id(output);
+    uint32_t sid = attrs_.storage_id[eid];
+    auto en_dev = nodes_[eid].device_type;
+    CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev)
+        << "Cannot map the same storage id to multiple devices.";
+    sid_dev_map[sid] = en_dev;
+  }
+  return sid_dev_map;
+}
+
 void GraphRuntime::SetupStorage() {
   // Grab saved optimization plan from graph.
   std::vector<TVMType> vtype;
   for (const std::string& s_type : attrs_.dltype) {
     vtype.push_back(tvm::runtime::String2TVMType(s_type));
   }
-  data_entry_.resize(num_node_entries());
-  // size of each storage pool entry
-  std::vector<size_t> pool_entry_bytes;
-  // Find the maximum space size.
+
+  const StorageDeviceMap& sid_dev_map = GetStorageDeviceMap();
+  std::unordered_map<uint32_t, size_t> device_pool_entry_bytes;
+
+  // Find the maximum space size for each device.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     size_t size = 1;
@@ -474,23 +527,38 @@ void GraphRuntime::SetupStorage() {
     CHECK_EQ(bits % 8U, 0U);
     size_t bytes = (bits / 8U) * size;
 
-    size_t sid = static_cast<size_t>(storage_id);
-    if (sid >= pool_entry_bytes.size()) {
-      pool_entry_bytes.resize(sid + 1, 0);
-    }
-    pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes);
+    uint32_t sid = static_cast<uint32_t>(storage_id);
+    device_pool_entry_bytes[sid] =
+        std::max(device_pool_entry_bytes[sid], bytes);
   }
-  // Allocate the space.
-  for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
+
+  // Allocate the space on each device.
+  for (const auto& pit : device_pool_entry_bytes) {
     std::vector<int64_t> shape;
-    shape.push_back(static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4);
-    storage_pool_.push_back(NDArray::Empty(shape, DLDataType {kDLFloat, 32, 1}, ctx_));
+    shape.push_back(static_cast<int64_t>(pit.second + 3) / 4);
+    TVMContext ctx = ctxs_[0];
+    // This for loop is very fast since there are usually only a couple of
+    // devices available on the same hardware.
+    for (const auto& cit : ctxs_) {
+      if (sid_dev_map.at(pit.first) == cit.device_type) {
+        ctx = cit;
+        break;
+      }
+    }
+    device_storage_pool_[pit.first] =
+        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx);
   }
-  // Assign the pooled entries.
+
+  // Assign the pooled entries. A unified memory pool is used to simplifiy
+  // memory assignment for each node entry. The allocated memory on each device
+  // is mapped to this pool by querying the storage id to device type map.
+  data_entry_.resize(num_node_entries());
   for (size_t i = 0; i < data_entry_.size(); ++i) {
-    int storage_id = attrs_.storage_id[i];
-    CHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    uint32_t storage_id = static_cast<uint32_t>(attrs_.storage_id[i]);
+    CHECK(device_storage_pool_.count(storage_id))
+        << "The storage hasn't been assigned to a specific device.";
+    data_entry_[i] =
+        device_storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
   }
 }
 
@@ -508,8 +576,9 @@ void GraphRuntime::SetupOpExecs() {
       uint32_t eid = this->entry_id(nid, index);
       args.push_back(*(data_entry_[eid].operator->()));
     }
-    CHECK_EQ(inode.op_type, "tvm_op")
-        << "Can only take tvm_op as op";
+    CHECK(inode.op_type == "tvm_op" || inode.op_type == "device_copy_op")
+        << "Can only take tvm_op or device_copy_op as op";
+
     op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size());
   }
 }
@@ -543,13 +612,26 @@ std::function<void()> GraphRuntime::CreateTVMOp(
       t->shape = &(arg_ptr->shape_data[i]);
     }
   }
+
   if (param.func_name == "__nop") {
     return [](){};
+  } else if (param.func_name == "__copy") {
+    // Perform cross device data copy.
+    // Directly copy data from the input to the output.
+    auto fexec = [arg_ptr]() {
+      DLTensor* from = static_cast<DLTensor*>(arg_ptr->arg_values[0].v_handle);
+      DLTensor* to = static_cast<DLTensor*>(arg_ptr->arg_values[1].v_handle);
+      TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr));
+    };
+    return fexec;
   }
-  // get compiled function from module.
+
+  // Get compiled function from the module that contains both host and device
+  // code.
   tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false);
   CHECK(pf != nullptr) << "no such function in module: " << param.func_name;
-  auto fexec = [arg_ptr, pf] () {
+
+  auto fexec = [arg_ptr, pf]() {
     TVMRetValue rv;
     TVMArgs targs(arg_ptr->arg_values.data(),
                   arg_ptr->arg_tcodes.data(),
@@ -562,7 +644,7 @@ std::function<void()> GraphRuntime::CreateTVMOp(
 PackedFunc GraphRuntime::GetFunction(
     const std::string& name,
     const std::shared_ptr<ModuleNode>& sptr_to_self) {
-  // return member functions during query.
+  // Return member functions during query.
   if (name == "set_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         if (args[0].type_code() == kStr) {
@@ -618,29 +700,51 @@ PackedFunc GraphRuntime::GetFunction(
   }
 }
 
-Module GraphRuntimeCreate(std::string sym_json,
-                          tvm::runtime::Module m,
-                          int device_type,
-                          int device_id) {
-  TVMContext ctx;
-  ctx.device_type = static_cast<DLDeviceType>(device_type);
-  ctx.device_id   = device_id;
+Module GraphRuntimeCreate(const std::string& sym_json,
+                          const tvm::runtime::Module& m,
+                          const std::vector<TVMContext>& ctxs) {
   std::shared_ptr<GraphRuntime> exec = std::make_shared<GraphRuntime>();
-  exec->Init(sym_json, m, ctx);
+  exec->Init(sym_json, m, ctxs);
   return Module(exec);
 }
 
+// Get all context for the host and other runtime devices.
+std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
+  std::vector<TVMContext> ret;
+  int* device_types = args[2].ptr<int>();
+  int* device_ids = args[3].ptr<int>();
+  int num_devices = args[4];
+
+  TVMContext ctx;
+  for (int i = 0; i < num_devices; i++) {
+    ctx.device_type = static_cast<DLDeviceType>(device_types[i]);
+    ctx.device_id = device_ids[i];
+    ret.push_back(ctx);
+  }
+
+  return ret;
+}
+
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-    *rv = GraphRuntimeCreate(args[0], args[1], args[2], args[3]);
-  });
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      CHECK_EQ(args.size(), 5) << "5 arguments are expected, but "
+                               << args.size() << " are passed in.";
+      tvm::runtime::Module modules = args[1];
+      const auto& contexts = GetAllContext(args);
+      *rv = GraphRuntimeCreate(args[0], modules, contexts);
+    });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-    void* mhandle = args[1];
-    *rv = GraphRuntimeCreate(args[0],
-                             *static_cast<tvm::runtime::Module*>(mhandle),
-                             args[2], args[3]);
-  });
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      void* mhandle = args[1];
+      TVMContext ctx;
+      int dev_type = args[2];
+      ctx.device_type = static_cast<DLDeviceType>(dev_type); 
+      ctx.device_id = args[3];
+      std::vector<TVMContext> contexts{ctx};
+      *rv = GraphRuntimeCreate(args[0],
+                               *static_cast<tvm::runtime::Module*>(mhandle),
+                               contexts);
+    });
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
new file mode 100644
index 000000000000..4805bd44ec73
--- /dev/null
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -0,0 +1,413 @@
+# pylint: disable=too-many-locals
+"""Unit tests for heterogeneous runtime"""
+import json
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime, util
+import topi
+
+def get_simplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where only one copy node is
+    inserted. Tis node copies data from the target device to cpu.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy      C
+                       \      /
+                     elemwise_sub  (cpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "device_type": device_dev_type,
+             "inputs": []}
+    var_b = {"op": "null", "name": "B", "device_type": device_dev_type,
+             "inputs": []}
+    elemwise_add = {
+        "op": "tvm_op", "name": "elemwise_add", "device_type": device_dev_type,
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy = {
+        "op": "device_copy_op",
+        "name": "__copy_add_to_sub",
+        "device_type": host_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "device_type": host_dev_type,
+             "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "device_type": host_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
+    arg_nodes = [0, 1, 4]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
+    heads = [[5, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
+        "device_type": ["list_int", [device_dev_type, device_dev_type,
+                                     device_dev_type, host_dev_type,
+                                     host_dev_type, host_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_simplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring is from the target device to the host processor at runtime.
+    The host processor is always assumed to be cpu, and the device varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Create module for add whose target is the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                   + tensor_b(*i), name="elemwise_add")
+        target = topi.cpp.TEST_create_target(device)
+        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
+        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
+                              name="elemwise_add")
+        lib_add, host_funcs_add = tvm.build(lower_add, target=target_device,
+                                            name="elemwise_add",
+                                            postpone_host_codegen=True)
+
+        # Insert copy. Neither compute nor schedule is required for the copy
+        # node. The compute will be performed at runtime which is just data
+        # copy from the input to the output.
+        tensor_copy = tvm.placeholder(shape, name="__copy")
+
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        schedule_sub = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+
+        lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host,
+                                            name="elemwise_sub",
+                                            postpone_host_codegen=True)
+        host_funcs = host_funcs_add + host_funcs_sub
+        combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub],
+                                           target_host=target_host)
+
+        ctx = [host_ctx, device_ctx]
+        mod = graph_runtime.create(graph, combined_mod, ctx)
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        mod.set_input(**params)
+        mod.run()
+        out = mod.get_output(0, tvm.nd.empty(shape))
+        np.testing.assert_equal(
+            out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
+
+    dev_tar = {"gpu": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+
+def get_duplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where two copy nodes are inserted.
+    Data transferring happens back-and-forth between the target device and CPU.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy        C
+                       \        /
+                      elemwise_sub  (cpu)
+                         \
+                         copy          D
+                           \          /
+                           elemwise_add  (gpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "device_type": device_dev_type,
+             "inputs": []}
+    var_b = {"op": "null", "name": "B", "device_type": device_dev_type,
+             "inputs": []}
+    elemwise_add0 = {
+        "op": "tvm_op", "name": "elemwise_add0", "device_type":
+        device_dev_type,
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add0",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy_add_sub = {
+        "op": "device_copy_op",
+        "name": "__copy_add_to_sub",
+        "device_type": host_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "device_type": host_dev_type,
+             "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "device_type": host_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+    copy_sub_add = {
+        "op": "device_copy_op",
+        "name": "__copy_sub_to_add",
+        "device_type": device_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[5, 0, 0]]
+    }
+    var_d = {"op": "null", "name": "D", "device_type": device_dev_type,
+             "inputs": []}
+    elemwise_add1 = {
+        "op": "tvm_op", "name": "elemwise_add1",
+        "device_type": device_dev_type,
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_add1",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[6, 0, 0], [7, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add0, copy_add_sub, var_c, elemwise_sub,
+             copy_sub_add, var_d, elemwise_add1]
+    arg_nodes = [0, 1, 4, 7]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    heads = [[8, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape,
+                                 shape, shape, shape]],
+        "device_type": ["list_int", [device_dev_type, device_dev_type,
+                                device_dev_type,
+                                host_dev_type, host_dev_type, host_dev_type,
+                                device_dev_type, device_dev_type,
+                                device_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_duplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring occurs back-and-forth between the target device and host
+    processor.
+    The host processor is always assumed to be cpu, and the target device
+    varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Insert copy nodes for data transferring between add and sub nodes.
+        # Transfers data from gpu to cpu.
+        copy_add_sub = tvm.placeholder(shape, name="__copy0")
+        # Transfers data from cpu to gpu.
+        copy_sub_add = tvm.placeholder(shape, name="__copy1")
+
+        # Create a module containing adds on the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        tensor_d = tvm.placeholder(shape, name="D")
+        elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                    + tensor_b(*i), name="elemwise_add0")
+        elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i)
+                                    + tensor_d(*i), name="elemwise_add1")
+        target = topi.cpp.TEST_create_target(device)
+        add_schedule0 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add0])
+        lower_add0 = tvm.lower(
+            add_schedule0, [tensor_a, tensor_b, elemwise_add0],
+            name="elemwise_add0")
+        add_schedule1 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add1])
+        lower_add1 = tvm.lower(
+            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
+            name="elemwise_add1")
+        lib_add, host_funcs_add = tvm.build([lower_add0, lower_add1],
+                                            target=target_device,
+                                            postpone_host_codegen=True)
+
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        sub_schedule = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+        lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host,
+                                            postpone_host_codegen=True)
+        host_funcs = host_funcs_add + host_funcs_sub
+
+        combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub],
+                                           target_host=target_host)
+        ctx = [host_ctx, device_ctx]
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        params["D"] = tensor_d = np.random.uniform(
+            size=shape).astype(tensor_d.dtype)
+
+        def check_verify():
+            mod = graph_runtime.create(graph, combined_mod, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        def check_load_module():
+            temp = util.tempdir()
+            path_lib = temp.relpath("deploy.so")
+            combined_mod.export_library(path_lib)
+            with open(temp.relpath("deploy.json"), "w") as out_file:
+                out_file.write(graph)
+            loaded_lib = tvm.module.load(path_lib)
+            loaded_graph = open(temp.relpath("deploy.json")).read()
+            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        check_verify()
+        check_load_module()
+
+    dev_tar = {"gpu": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+if __name__ == "__main__":
+    test_simplex_data_transferring()
+    test_duplex_data_transferring()

From 0d0b4193c66523bf835e408a40cc1a1403255a79 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Tue, 18 Sep 2018 11:41:56 -0700
Subject: [PATCH 02/10] fix lint

---
 src/runtime/graph/graph_runtime.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 54084358ef69..a4eedeb5b1b4 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -739,7 +739,7 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
       void* mhandle = args[1];
       TVMContext ctx;
       int dev_type = args[2];
-      ctx.device_type = static_cast<DLDeviceType>(dev_type); 
+      ctx.device_type = static_cast<DLDeviceType>(dev_type);
       ctx.device_id = args[3];
       std::vector<TVMContext> contexts{ctx};
       *rv = GraphRuntimeCreate(args[0],

From f0ef89d4b5e41f975dabf02d7e732518df586470 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Tue, 18 Sep 2018 19:51:11 -0700
Subject: [PATCH 03/10] reserve the functionality of calling graph_time.create
 from java and js

---
 src/runtime/graph/graph_runtime.cc | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index a4eedeb5b1b4..449604a3c01e 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -727,11 +727,28 @@ std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
     .set_body([](TVMArgs args, TVMRetValue* rv) {
-      CHECK_EQ(args.size(), 5) << "5 arguments are expected, but "
-                               << args.size() << " are passed in.";
-      tvm::runtime::Module modules = args[1];
-      const auto& contexts = GetAllContext(args);
-      *rv = GraphRuntimeCreate(args[0], modules, contexts);
+      std::vector<TVMContext> contexts;
+      // 4 argument version is currently reserved to keep support of calling
+      // from jvm4j and js, since they don't have heterogeneous execution
+      // support yet. For heterogenenous execution, 5 arguments will be passed
+      // in. They are graph_json, module, list of context types, list of context
+      // ids, and the number of devices. Eventually, we will only have the
+      // version with 5 parameters when we support heterogeneous execution for
+      // Java and js.
+      if (args.num_args == 4) {
+        TVMContext ctx;
+        int dev_type = args[2];
+        ctx.device_type = static_cast<DLDeviceType>(dev_type);
+        ctx.device_id = args[3];
+        contexts.push_back(ctx);
+      } else if (args.num_args == 5) {
+        contexts = GetAllContext(args);
+      } else {
+        LOG(FATAL)
+            << "The number arguments of creaet must be 4 or 5, but it has "
+            << args.num_args;
+      }
+      *rv = GraphRuntimeCreate(args[0], args[1], contexts);
     });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")

From 6f02aa13092762db55e416ffb7f941247ceed251 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Tue, 18 Sep 2018 22:33:31 -0700
Subject: [PATCH 04/10] default device_type to CPU for transition purpose

---
 src/runtime/graph/graph_runtime.cc | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 449604a3c01e..ddad5bf2103b 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -242,7 +242,12 @@ class GraphRuntime : public ModuleNode {
     // inputs
     std::vector<NodeEntry> inputs;
     // device_type is used to indicate where the node should be scheduled to.
-    DLDeviceType device_type;
+    // TODO(zhiics) device_type is defaulted to CPU for transition purpose only
+    // because it will have random value otherwise. Using this default value is
+    // to make sure homogeneous execution works correctly. It will be removed
+    // when we add the compiler pass as we will read the serialized value from
+    // json for all execution.
+    DLDeviceType device_type{kDLCPU};
     // control deps
     std::vector<uint32_t> control_deps;
     // JSON Loader
@@ -496,7 +501,7 @@ StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const {
   for (const auto& output : outputs_) {
     uint32_t eid = this->entry_id(output);
     uint32_t sid = attrs_.storage_id[eid];
-    auto en_dev = nodes_[eid].device_type;
+    auto en_dev = nodes_[output.node_id].device_type;
     CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev)
         << "Cannot map the same storage id to multiple devices.";
     sid_dev_map[sid] = en_dev;
@@ -728,13 +733,13 @@ std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
     .set_body([](TVMArgs args, TVMRetValue* rv) {
       std::vector<TVMContext> contexts;
-      // 4 argument version is currently reserved to keep support of calling
-      // from jvm4j and js, since they don't have heterogeneous execution
-      // support yet. For heterogenenous execution, 5 arguments will be passed
-      // in. They are graph_json, module, list of context types, list of context
-      // ids, and the number of devices. Eventually, we will only have the
-      // version with 5 parameters when we support heterogeneous execution for
-      // Java and js.
+      // 4-argument version is currently reserved to keep support of calling
+      // from tvm4j and javascript, since they don't have heterogeneous
+      // execution support yet. For heterogenenous execution, 5 arguments will
+      // be passed in. They are graph_json, module, list of context types, list
+      // of context ids, and the number of devices.
+      // Eventually, we will only have the version with 5 arguments when we
+      // support heterogeneous execution for Java and js.
       if (args.num_args == 4) {
         TVMContext ctx;
         int dev_type = args[2];
@@ -745,7 +750,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
         contexts = GetAllContext(args);
       } else {
         LOG(FATAL)
-            << "The number arguments of creaet must be 4 or 5, but it has "
+            << "The expected number of arguments for graph_runtime.create is "
+               "4 or 5, but it has "
             << args.num_args;
       }
       *rv = GraphRuntimeCreate(args[0], args[1], contexts);

From ec979b761d1e5113a336203aff75dcd0fff6da57 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Wed, 19 Sep 2018 09:34:56 -0700
Subject: [PATCH 05/10] fix description for tvm.build

---
 python/tvm/build_module.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 88506b49d175..ad08de9e5e5d 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -427,8 +427,11 @@ def build(sch,
 
     Returns
     -------
-    f : Function, or pair of functions
-       The result function.
+    ret : tvm.module, or (list of LoweredFunc, tvm.module) tuple
+        A module that combines both host and device code is returned when
+        postpone_host_codegen is not set. Otherwise, a list of lowered
+        functions for the host and a module contains only device code are
+        returned.
 
     Note
     ----

From a619cf4b49f2761c4271a8847c13050a1695f748 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Thu, 20 Sep 2018 12:48:12 -0700
Subject: [PATCH 06/10] remove map structures

---
 python/tvm/__init__.py                        |   2 +-
 python/tvm/build_module.py                    |  54 +----
 python/tvm/contrib/graph_runtime.py           |  22 +-
 src/runtime/graph/graph_runtime.cc            | 195 ++++++++----------
 .../unittest/test_runtime_heterogeneous.py    |  76 +++----
 5 files changed, 126 insertions(+), 223 deletions(-)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 7076cf74b630..a028dfeddf36 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -31,7 +31,7 @@
 from .node import register_node
 from .ndarray import register_extension
 from .schedule import create_schedule
-from .build_module import build, lower, build_config, combine_modules
+from .build_module import build, lower, build_config
 from .tag import tag_scope
 
 # Contrib initializers
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index ad08de9e5e5d..9f63d0a6a50d 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -518,61 +518,9 @@ def build(sch,
     # collected.
     mdev = codegen.build_module(fdevice, str(target_device)) if fdevice else None
     if postpone_host_codegen:
-        return mdev, fhost
+        return fhost, mdev
 
     mhost = codegen.build_module(fhost, str(target_host))
     if fdevice:
         mhost.import_module(mdev)
     return mhost
-
-def combine_modules(host_funcs, device_modules, target_host=None):
-    """ Generate the host module for the lowered host functions by combining
-    them together. Then all device modules are imported to the combined host
-    module. This function is used for heterogeneous execution where multiple
-    device modules need to be imported to the host module. For homogeneous
-    execution, tvm.build is sufficient.
-
-    Parameters
-    ----------
-    host_funcs : LoweredFunc or list of LoweredFunc.
-        Lowered functions to be combined as the host module through codegen.
-
-    device_modules : tvm.module or list of tvm.module.
-        Device modules will be imported into host module.
-
-    Returns
-    -------
-        mhost : The module that contains both host and device code.
-    """
-    if isinstance(host_funcs, container.LoweredFunc):
-        host_funcs = [host_funcs]
-    elif not isinstance(host_funcs, (list, tuple, container.Array)):
-        raise ValueError("host_fucns must be the type of LoweredFunc or list "
-                         "of LoweredFunc.")
-
-    if isinstance(device_modules, module.Module):
-        device_modules = [device_modules]
-    elif not isinstance(device_modules, (list, tuple, container.Array)):
-        raise ValueError("host_funcs must be the type of Module or list of "
-                         "Module.")
-    for func in host_funcs:
-        if not isinstance(func, container.LoweredFunc):
-            raise ValueError("host_fucns must be the type of LoweredFunc or "
-                             "list of LoweredFunc.")
-    for device_mod in device_modules:
-        if device_mod and not isinstance(device_mod, module.Module):
-            raise ValueError("device_modules must be the type of Module or "
-                             "list of Module.")
-
-    if not target_host:
-        target_host = "llvm" if module.enabled("llvm") else "stackvm"
-    target_host = _target.create(target_host)
-
-    # Generate code for the list of host functions.
-    mhost = codegen.build_module(host_funcs, str(target_host))
-    # Import all device modules.
-    for device_mod in device_modules:
-        if device_mod:
-            mhost.import_module(device_mod)
-
-    return mhost
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 2f8354068337..a857225008f3 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -38,13 +38,13 @@ def create(graph_json_str, libmod, ctx):
     elif not isinstance(ctx, (list, tuple)):
         raise ValueError("ctx has to be the type of TVMContext or a list of "
                          "TVMCTVMContext")
-    has_cpu = False
+    cpu_ctx_index = -1
     for i, cur_ctx in enumerate(ctx):
         if not isinstance(cur_ctx, TVMContext):
             raise ValueError("ctx has to be the type of TVMContext or a list "
                              "of TVMCTVMContext")
         if cur_ctx.device_type == tvm.cpu(0).device_type:
-            has_cpu = True
+            cpu_ctx_index = i
         elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
             ctx[0], ctx[i] = ctx[i], ctx[0]
 
@@ -56,31 +56,23 @@ def create(graph_json_str, libmod, ctx):
         device_ids.append(cur_ctx.device_id)
 
     if device_types[0] >= rpc_base.RPC_SESS_MASK:
-        if num_devices > 1:
-            raise ValueError("RPC hasn't been supported for heterogeneous "
-                             "execution yet.")
         assert libmod.type_key == "rpc"
         assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index
         hmod = rpc_base._ModuleHandle(libmod)
         fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
         device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, device_types[0],
-                                   device_ids[0]), ctx[0])
+        return GraphModule(fcreate(graph_json_str, hmod, num_devices,
+                                   *device_types, *device_ids), ctx[0])
 
     # Assume CPU is the host processor when there are multiple devices on
     # a hardware platform.
-    if (num_devices > 1) and (not has_cpu):
+    if (num_devices > 1) and (cpu_ctx_index < 0):
         raise RuntimeError(
             "CPU should be the host processor for heterogenous execution, but"
             " not found in ctx.")
-
-    device_type_arr = (ctypes.c_int * num_devices)(*device_types)
-    void_dt_arr = ctypes.cast(device_type_arr, ctypes.c_void_p)
-    device_id_arr = (ctypes.c_int * num_devices)(*device_ids)
-    void_di_arr = ctypes.cast(device_id_arr, ctypes.c_void_p)
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, void_dt_arr,
-                               void_di_arr, num_devices), ctx[0])
+    return GraphModule(fcreate(graph_json_str, libmod, num_devices,
+                               *device_types, *device_ids), ctx[cpu_ctx_index])
 
 
 class GraphModule(object):
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index ddad5bf2103b..4b3194f38c0c 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -20,9 +20,6 @@
 
 namespace tvm {
 namespace runtime {
-using StorageDeviceMap = std::unordered_map<uint32_t, DLDeviceType>;
-using DeviceStoragePoolMap = std::unordered_map<size_t, NDArray>;
-using ModuleContextMap = std::unordered_map<tvm::runtime::Module*, TVMContext>;
 
 /*! \brief Macro to do C API call. */
 #define TVM_CCALL(func)                                            \
@@ -211,6 +208,12 @@ class GraphRuntime : public ModuleNode {
   }
 
  private:
+  // Memory pool entry.
+  struct PoolEntry {
+    size_t size;
+    int device_type;
+    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+  };
   // Node entry
   struct NodeEntry {
     uint32_t node_id;
@@ -241,13 +244,6 @@ class GraphRuntime : public ModuleNode {
     TVMOpParam param;
     // inputs
     std::vector<NodeEntry> inputs;
-    // device_type is used to indicate where the node should be scheduled to.
-    // TODO(zhiics) device_type is defaulted to CPU for transition purpose only
-    // because it will have random value otherwise. Using this default value is
-    // to make sure homogeneous execution works correctly. It will be removed
-    // when we add the compiler pass as we will read the serialized value from
-    // json for all execution.
-    DLDeviceType device_type{kDLCPU};
     // control deps
     std::vector<uint32_t> control_deps;
     // JSON Loader
@@ -268,20 +264,14 @@ class GraphRuntime : public ModuleNode {
           bitmask |= 4;
         } else if (key == "flatten_data") {
           param->flatten_data = strtoul(value.c_str(), nullptr, 10);
-          // TODO(zhiics) Enable the following when annotation is added to the
-          // heterogeneous compilation part.
-          // bitmask |= 8;
         }
       }
-      // TODO(zhiics) Add |8 when annotation is added to the heterogeneous
-      // compilation part.
       CHECK_EQ(bitmask, 1|2|4) << "invalid format";
     }
     // JSON Loader
     void Load(dmlc::JSONReader *reader) {
       reader->BeginObject();
       int bitmask = 0;
-      int device_type = 0;
       std::string key;
       while (reader->NextObjectItem(&key)) {
         if (key == "op") {
@@ -297,22 +287,17 @@ class GraphRuntime : public ModuleNode {
           this->LoadAttrs(reader, &param);
         } else if (key == "control_deps") {
           reader->Read(&control_deps);
-        } else if (key == "device_type") {
-          reader->Read(&device_type);
-          this->device_type = static_cast<DLDeviceType>(device_type);
-          // TODO(zhiics) Enable this when working on the compiler part.
-          // bitmask |= 8;
         } else {
           LOG(FATAL) << "do not support key " << key;
         }
       }
-      // TODO(zhiics) Add |8 in the compiler pass.
       CHECK_EQ(bitmask, 1|2|4) << "invalid format";
     }
   };
   struct GraphAttr {
     size_t storage_num_not_alloctaed{0};
     std::vector<int> storage_id;
+    std::vector<int> device_index;
     std::vector<std::string> dltype;
     std::vector<std::vector<int64_t> > shape;
     // The graph attribute fields.
@@ -348,6 +333,14 @@ class GraphRuntime : public ModuleNode {
           reader->Read(&shape);
           CHECK(!reader->NextArrayItem());
           bitmask |= 4;
+        } else if (key == "device_index") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&device_index);
+          CHECK(!reader->NextArrayItem());
         } else {
           reader->BeginArray();
           CHECK(reader->NextArrayItem());
@@ -400,8 +393,6 @@ class GraphRuntime : public ModuleNode {
   void SetupStorage();
   /*! \brief Setup the executors. */
   void SetupOpExecs();
-  /*! \brief Get storage id to device map. */
-  StorageDeviceMap GetStorageDeviceMap() const;
   /*!
    * \brief Create a executtion function given input.
    * \param attrs The node attributes.
@@ -443,8 +434,8 @@ class GraphRuntime : public ModuleNode {
   tvm::runtime::Module module_;
   /*! \brief Execution context of all devices including the host. */
   std::vector<TVMContext> ctxs_;
-  /*! \brief Common storage pool for each device. */
-  DeviceStoragePoolMap device_storage_pool_;
+  /*! \brief Common storage pool for all devices. */
+  std::vector<NDArray> storage_pool_;
   /*! \brief Data entry of each node. */
   std::vector<NDArray> data_entry_;
   /*! \brief Operator on each node. */
@@ -481,34 +472,6 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) {
   }
 }
 
-// Return a storage id to device type map. This map will be used to help memory
-// allocation for the storage pool of each device. It will be also used to
-// allocate memory to each data_entry_.
-StorageDeviceMap GraphRuntime::GetStorageDeviceMap() const {
-  StorageDeviceMap sid_dev_map;
-  for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) {
-    const auto& inode = nodes_[nid];
-    for (const auto& e : inode.inputs) {
-      uint32_t eid = this->entry_id(e);
-      uint32_t sid = attrs_.storage_id[eid];
-      auto en_dev = nodes_[e.node_id].device_type;
-      CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev)
-          << "Cannot map the same storage id to multiple devices.";
-      sid_dev_map[sid] = en_dev;
-    }
-  }
-  // Get all output entries.
-  for (const auto& output : outputs_) {
-    uint32_t eid = this->entry_id(output);
-    uint32_t sid = attrs_.storage_id[eid];
-    auto en_dev = nodes_[output.node_id].device_type;
-    CHECK(sid_dev_map.count(sid) == 0 || sid_dev_map[sid] == en_dev)
-        << "Cannot map the same storage id to multiple devices.";
-    sid_dev_map[sid] = en_dev;
-  }
-  return sid_dev_map;
-}
-
 void GraphRuntime::SetupStorage() {
   // Grab saved optimization plan from graph.
   std::vector<TVMType> vtype;
@@ -516,12 +479,16 @@ void GraphRuntime::SetupStorage() {
     vtype.push_back(tvm::runtime::String2TVMType(s_type));
   }
 
-  const StorageDeviceMap& sid_dev_map = GetStorageDeviceMap();
-  std::unordered_map<uint32_t, size_t> device_pool_entry_bytes;
-
-  // Find the maximum space size for each device.
+  // Size and device type of each storage pool entry.
+  std::vector<PoolEntry> pool_entry;
+  // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    // Use the fallback device if no device index is available.
+    int device_type = static_cast<int>(ctxs_[0].device_type);
+    if (!attrs_.device_index.empty()) {
+      device_type = attrs_.device_index[i];
+    }
     size_t size = 1;
     for (int64_t sz : attrs_.shape[i]) {
       size *= static_cast<size_t>(sz);
@@ -533,37 +500,43 @@ void GraphRuntime::SetupStorage() {
     size_t bytes = (bits / 8U) * size;
 
     uint32_t sid = static_cast<uint32_t>(storage_id);
-    device_pool_entry_bytes[sid] =
-        std::max(device_pool_entry_bytes[sid], bytes);
+    if (sid >= pool_entry.size()) {
+      pool_entry.resize(sid + 1, {0, -1});
+    } else {
+      CHECK(pool_entry[sid].device_type == -1 ||
+            pool_entry[sid].device_type == device_type)
+          << "The same pool entry cannot be assigned to multiple devices";
+    }
+    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
+    pool_entry[sid].device_type = device_type;
   }
 
-  // Allocate the space on each device.
-  for (const auto& pit : device_pool_entry_bytes) {
+  // Allocate the space.
+  for (size_t i = 0; i < pool_entry.size(); ++i) {
     std::vector<int64_t> shape;
-    shape.push_back(static_cast<int64_t>(pit.second + 3) / 4);
     TVMContext ctx = ctxs_[0];
     // This for loop is very fast since there are usually only a couple of
     // devices available on the same hardware.
     for (const auto& cit : ctxs_) {
-      if (sid_dev_map.at(pit.first) == cit.device_type) {
+      if (pool_entry[i].device_type == static_cast<int>(cit.device_type)) {
         ctx = cit;
         break;
       }
     }
-    device_storage_pool_[pit.first] =
-        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx);
+    shape.push_back(static_cast<int64_t>(pool_entry[i].size + 3) / 4);
+    storage_pool_.push_back(
+        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
   }
 
   // Assign the pooled entries. A unified memory pool is used to simplifiy
   // memory assignment for each node entry. The allocated memory on each device
-  // is mapped to this pool by querying the storage id to device type map.
+  // is mapped to this pool.
   data_entry_.resize(num_node_entries());
   for (size_t i = 0; i < data_entry_.size(); ++i) {
-    uint32_t storage_id = static_cast<uint32_t>(attrs_.storage_id[i]);
-    CHECK(device_storage_pool_.count(storage_id))
-        << "The storage hasn't been assigned to a specific device.";
+    int storage_id = attrs_.storage_id[i];
+    CHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
     data_entry_[i] =
-        device_storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+        storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
   }
 }
 
@@ -715,59 +688,57 @@ Module GraphRuntimeCreate(const std::string& sym_json,
 
 // Get all context for the host and other runtime devices.
 std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
-  std::vector<TVMContext> ret;
-  int* device_types = args[2].ptr<int>();
-  int* device_ids = args[3].ptr<int>();
-  int num_devices = args[4];
-
-  TVMContext ctx;
-  for (int i = 0; i < num_devices; i++) {
-    ctx.device_type = static_cast<DLDeviceType>(device_types[i]);
-    ctx.device_id = device_ids[i];
-    ret.push_back(ctx);
+  // Reserve the first item as the fallback device.
+  std::vector<TVMContext> ret(1);
+  if (args.num_args == 4) {
+    int dev_type = args[2];
+    ret[0].device_type = static_cast<DLDeviceType>(dev_type);
+    ret[0].device_id = args[3];
+  } else {
+    int num_devices = args[2];
+    CHECK_EQ(args.num_args - 3, num_devices * 2)
+        << "The number of device_type and device_id passed in doesn't match, "
+           "or the number of is not the same as the number of devices.";
+    TVMContext ctx;
+    for (int i = 0; i < num_devices; i++) {
+      int dev_type = args[3 + i];
+      ctx.device_type = static_cast<DLDeviceType>(dev_type);
+      ctx.device_id = args[3 + i + num_devices];
+      if (ctx.device_type == static_cast<int>(kDLCPU)) {
+        ret[0] = ctx;
+      } else {
+        ret.push_back(ctx);
+      }
+    }
   }
-
   return ret;
 }
 
+// 4-argument version is currently reserved to keep support of calling
+// from tvm4j and javascript, since they don't have heterogeneous
+// execution support yet. For heterogenenous execution, at least 5 arguments will
+// be passed in. The third one is the number of devices.
+// Eventually, we will only probably pass TVMContext for all the languages.
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
     .set_body([](TVMArgs args, TVMRetValue* rv) {
-      std::vector<TVMContext> contexts;
-      // 4-argument version is currently reserved to keep support of calling
-      // from tvm4j and javascript, since they don't have heterogeneous
-      // execution support yet. For heterogenenous execution, 5 arguments will
-      // be passed in. They are graph_json, module, list of context types, list
-      // of context ids, and the number of devices.
-      // Eventually, we will only have the version with 5 arguments when we
-      // support heterogeneous execution for Java and js.
-      if (args.num_args == 4) {
-        TVMContext ctx;
-        int dev_type = args[2];
-        ctx.device_type = static_cast<DLDeviceType>(dev_type);
-        ctx.device_id = args[3];
-        contexts.push_back(ctx);
-      } else if (args.num_args == 5) {
-        contexts = GetAllContext(args);
-      } else {
-        LOG(FATAL)
-            << "The expected number of arguments for graph_runtime.create is "
-               "4 or 5, but it has "
-            << args.num_args;
-      }
+      CHECK_GE(args.num_args, 4)
+          << "The expected number of arguments for graph_runtime.create is "
+             "at least 4, but it has "
+          << args.num_args;
+      const auto& contexts = GetAllContext(args);
       *rv = GraphRuntimeCreate(args[0], args[1], contexts);
     });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
     .set_body([](TVMArgs args, TVMRetValue* rv) {
+      CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                    "graph_runtime.remote_create is "
+                                    "at least 4, but it has "
+                                 << args.num_args;
       void* mhandle = args[1];
-      TVMContext ctx;
-      int dev_type = args[2];
-      ctx.device_type = static_cast<DLDeviceType>(dev_type);
-      ctx.device_id = args[3];
-      std::vector<TVMContext> contexts{ctx};
-      *rv = GraphRuntimeCreate(args[0],
-                               *static_cast<tvm::runtime::Module*>(mhandle),
-                               contexts);
+      const auto& contexts = GetAllContext(args);
+      *rv = GraphRuntimeCreate(
+          args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
     });
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
index 4805bd44ec73..b8fcd089bc87 100644
--- a/tests/python/unittest/test_runtime_heterogeneous.py
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -32,12 +32,10 @@ def get_simplex_graph(host_dev_type, device_dev_type):
         A json encoded object.
     """
     # Construct each node in the graph.
-    var_a = {"op": "null", "name": "A", "device_type": device_dev_type,
-             "inputs": []}
-    var_b = {"op": "null", "name": "B", "device_type": device_dev_type,
-             "inputs": []}
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
     elemwise_add = {
-        "op": "tvm_op", "name": "elemwise_add", "device_type": device_dev_type,
+        "op": "tvm_op", "name": "elemwise_add",
         "attrs": {
             "flatten_data": "1",
             "func_name": "elemwise_add",
@@ -49,7 +47,6 @@ def get_simplex_graph(host_dev_type, device_dev_type):
     copy = {
         "op": "device_copy_op",
         "name": "__copy_add_to_sub",
-        "device_type": host_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "__copy",
@@ -58,11 +55,9 @@ def get_simplex_graph(host_dev_type, device_dev_type):
         },
         "inputs": [[2, 0, 0]]
     }
-    var_c = {"op": "null", "name": "C", "device_type": host_dev_type,
-             "inputs": []}
+    var_c = {"op": "null", "name": "C", "inputs": []}
     elemwise_sub = {
         "op": "tvm_op", "name": "elemwise_sub",
-        "device_type": host_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "elemwise_sub",
@@ -81,9 +76,9 @@ def get_simplex_graph(host_dev_type, device_dev_type):
     attrs = {
         "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
         "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
-        "device_type": ["list_int", [device_dev_type, device_dev_type,
-                                     device_dev_type, host_dev_type,
-                                     host_dev_type, host_dev_type]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type, host_dev_type,
+                                      host_dev_type, host_dev_type]],
         "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
         "dltype": ["list_str", ["float32", "float32", "float32",
                                 "float32", "float32", "float32"]]
@@ -129,7 +124,7 @@ def check_device(device, target_device):
         schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
         lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
                               name="elemwise_add")
-        lib_add, host_funcs_add = tvm.build(lower_add, target=target_device,
+        host_funcs_add, lib_add = tvm.build(lower_add, target=target_device,
                                             name="elemwise_add",
                                             postpone_host_codegen=True)
 
@@ -147,15 +142,18 @@ def check_device(device, target_device):
                                              elemwise_sub],
                               name="elemwise_sub")
 
-        lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host,
+        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
                                             name="elemwise_sub",
                                             postpone_host_codegen=True)
         host_funcs = host_funcs_add + host_funcs_sub
-        combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub],
-                                           target_host=target_host)
+        mhost = tvm.codegen.build_module(host_funcs, target_host)
+        if lib_add:
+            mhost.import_module(lib_add)
+        if lib_sub:
+            mhost.import_module(lib_sub)
 
         ctx = [host_ctx, device_ctx]
-        mod = graph_runtime.create(graph, combined_mod, ctx)
+        mod = graph_runtime.create(graph, mhost, ctx)
         params = {}
         params["A"] = tensor_a = np.random.uniform(
             size=shape).astype(tensor_a.dtype)
@@ -203,13 +201,10 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         A json encoded object.
     """
     # Construct each node in the graph.
-    var_a = {"op": "null", "name": "A", "device_type": device_dev_type,
-             "inputs": []}
-    var_b = {"op": "null", "name": "B", "device_type": device_dev_type,
-             "inputs": []}
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
     elemwise_add0 = {
-        "op": "tvm_op", "name": "elemwise_add0", "device_type":
-        device_dev_type,
+        "op": "tvm_op", "name": "elemwise_add0",
         "attrs": {
             "flatten_data": "1",
             "func_name": "elemwise_add0",
@@ -221,7 +216,6 @@ def get_duplex_graph(host_dev_type, device_dev_type):
     copy_add_sub = {
         "op": "device_copy_op",
         "name": "__copy_add_to_sub",
-        "device_type": host_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "__copy",
@@ -230,11 +224,9 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         },
         "inputs": [[2, 0, 0]]
     }
-    var_c = {"op": "null", "name": "C", "device_type": host_dev_type,
-             "inputs": []}
+    var_c = {"op": "null", "name": "C", "inputs": []}
     elemwise_sub = {
         "op": "tvm_op", "name": "elemwise_sub",
-        "device_type": host_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "elemwise_sub",
@@ -246,7 +238,6 @@ def get_duplex_graph(host_dev_type, device_dev_type):
     copy_sub_add = {
         "op": "device_copy_op",
         "name": "__copy_sub_to_add",
-        "device_type": device_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "__copy",
@@ -255,11 +246,9 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         },
         "inputs": [[5, 0, 0]]
     }
-    var_d = {"op": "null", "name": "D", "device_type": device_dev_type,
-             "inputs": []}
+    var_d = {"op": "null", "name": "D", "inputs": []}
     elemwise_add1 = {
         "op": "tvm_op", "name": "elemwise_add1",
-        "device_type": device_dev_type,
         "attrs": {
             "flatten_data": "0",
             "func_name": "elemwise_add1",
@@ -280,11 +269,11 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
         "shape": ["list_shape", [shape, shape, shape, shape, shape, shape,
                                  shape, shape, shape]],
-        "device_type": ["list_int", [device_dev_type, device_dev_type,
-                                device_dev_type,
-                                host_dev_type, host_dev_type, host_dev_type,
-                                device_dev_type, device_dev_type,
-                                device_dev_type]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type,
+                                      host_dev_type, host_dev_type, host_dev_type,
+                                      device_dev_type, device_dev_type,
+                                      device_dev_type]],
         "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
         "dltype": ["list_str", ["float32", "float32", "float32",
                                 "float32", "float32", "float32",
@@ -349,7 +338,7 @@ def check_device(device, target_device):
         lower_add1 = tvm.lower(
             add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
             name="elemwise_add1")
-        lib_add, host_funcs_add = tvm.build([lower_add0, lower_add1],
+        host_funcs_add, lib_add = tvm.build([lower_add0, lower_add1],
                                             target=target_device,
                                             postpone_host_codegen=True)
 
@@ -361,12 +350,15 @@ def check_device(device, target_device):
         lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
                                              elemwise_sub],
                               name="elemwise_sub")
-        lib_sub, host_funcs_sub = tvm.build(lower_sub, target=target_host,
+        host_funcs_sub, lib_sub = tvm.build(lower_sub, target=target_host,
                                             postpone_host_codegen=True)
         host_funcs = host_funcs_add + host_funcs_sub
+        mhost = tvm.codegen.build_module(host_funcs, target_host)
+        if lib_add:
+            mhost.import_module(lib_add)
+        if lib_sub:
+            mhost.import_module(lib_sub)
 
-        combined_mod = tvm.combine_modules(host_funcs, [lib_add, lib_sub],
-                                           target_host=target_host)
         ctx = [host_ctx, device_ctx]
         params = {}
         params["A"] = tensor_a = np.random.uniform(
@@ -379,7 +371,7 @@ def check_device(device, target_device):
             size=shape).astype(tensor_d.dtype)
 
         def check_verify():
-            mod = graph_runtime.create(graph, combined_mod, ctx)
+            mod = graph_runtime.create(graph, mhost, ctx)
             mod.set_input(**params)
             mod.run()
             out = mod.get_output(0, tvm.nd.empty(shape))
@@ -389,7 +381,7 @@ def check_verify():
         def check_load_module():
             temp = util.tempdir()
             path_lib = temp.relpath("deploy.so")
-            combined_mod.export_library(path_lib)
+            mhost.export_library(path_lib)
             with open(temp.relpath("deploy.json"), "w") as out_file:
                 out_file.write(graph)
             loaded_lib = tvm.module.load(path_lib)

From 51a6e3eec79110b7c5732dc6483fabc232bc3753 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Thu, 20 Sep 2018 14:09:15 -0700
Subject: [PATCH 07/10] fix python error

---
 python/tvm/contrib/graph_runtime.py           | 38 +++-----
 src/runtime/graph/graph_runtime.cc            | 86 ++++++++-----------
 .../unittest/test_runtime_heterogeneous.py    | 10 +--
 3 files changed, 53 insertions(+), 81 deletions(-)

diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index a857225008f3..5ca55f77c6a8 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -38,41 +38,29 @@ def create(graph_json_str, libmod, ctx):
     elif not isinstance(ctx, (list, tuple)):
         raise ValueError("ctx has to be the type of TVMContext or a list of "
                          "TVMCTVMContext")
-    cpu_ctx_index = -1
     for i, cur_ctx in enumerate(ctx):
         if not isinstance(cur_ctx, TVMContext):
             raise ValueError("ctx has to be the type of TVMContext or a list "
-                             "of TVMCTVMContext")
-        if cur_ctx.device_type == tvm.cpu(0).device_type:
-            cpu_ctx_index = i
-        elif cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
+                             "of TVMContext")
+        if cur_ctx.device_type == tvm.cpu(0).device_type or \
+           cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
             ctx[0], ctx[i] = ctx[i], ctx[0]
 
-    num_devices = len(ctx)
-    device_types = []
-    device_ids = []
-    for cur_ctx in ctx:
-        device_types.append(cur_ctx.device_type)
-        device_ids.append(cur_ctx.device_id)
-
-    if device_types[0] >= rpc_base.RPC_SESS_MASK:
+    # ctx[0] is used as the primary/fallback context. All other ones are used
+    # as device context for heterogeneous execution.
+    device_type_id = [x for c in ctx[1:] for x in [c.device_type, c.device_id]]
+    if ctx[0].device_type >= rpc_base.RPC_SESS_MASK:
         assert libmod.type_key == "rpc"
         assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index
         hmod = rpc_base._ModuleHandle(libmod)
         fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        device_types[0] = device_types[0] % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, num_devices,
-                                   *device_types, *device_ids), ctx[0])
-
-    # Assume CPU is the host processor when there are multiple devices on
-    # a hardware platform.
-    if (num_devices > 1) and (cpu_ctx_index < 0):
-        raise RuntimeError(
-            "CPU should be the host processor for heterogenous execution, but"
-            " not found in ctx.")
+        device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK
+        return GraphModule(fcreate(graph_json_str, hmod, device_type,
+                                   ctx[0].device_id, *device_type_id), ctx[0])
+
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, num_devices,
-                               *device_types, *device_ids), ctx[cpu_ctx_index])
+    return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type,
+                               ctx[0].device_id, *device_type_id), ctx[0])
 
 
 class GraphModule(object):
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 4b3194f38c0c..a48047fe369c 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -264,9 +264,10 @@ class GraphRuntime : public ModuleNode {
           bitmask |= 4;
         } else if (key == "flatten_data") {
           param->flatten_data = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 8;
         }
       }
-      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
+      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
     }
     // JSON Loader
     void Load(dmlc::JSONReader *reader) {
@@ -512,18 +513,16 @@ void GraphRuntime::SetupStorage() {
   }
 
   // Allocate the space.
-  for (size_t i = 0; i < pool_entry.size(); ++i) {
+  for (const auto& pit : pool_entry) {
     std::vector<int64_t> shape;
-    TVMContext ctx = ctxs_[0];
     // This for loop is very fast since there are usually only a couple of
     // devices available on the same hardware.
-    for (const auto& cit : ctxs_) {
-      if (pool_entry[i].device_type == static_cast<int>(cit.device_type)) {
-        ctx = cit;
-        break;
-      }
-    }
-    shape.push_back(static_cast<int64_t>(pool_entry[i].size + 3) / 4);
+    const auto& cit =
+        std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) {
+          return pit.device_type == static_cast<int>(c.device_type);
+        });
+    TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit;
+    shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
     storage_pool_.push_back(
         NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
   }
@@ -554,8 +553,7 @@ void GraphRuntime::SetupOpExecs() {
       uint32_t eid = this->entry_id(nid, index);
       args.push_back(*(data_entry_[eid].operator->()));
     }
-    CHECK(inode.op_type == "tvm_op" || inode.op_type == "device_copy_op")
-        << "Can only take tvm_op or device_copy_op as op";
+    CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op";
 
     op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size());
   }
@@ -689,27 +687,13 @@ Module GraphRuntimeCreate(const std::string& sym_json,
 // Get all context for the host and other runtime devices.
 std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
   // Reserve the first item as the fallback device.
-  std::vector<TVMContext> ret(1);
-  if (args.num_args == 4) {
-    int dev_type = args[2];
-    ret[0].device_type = static_cast<DLDeviceType>(dev_type);
-    ret[0].device_id = args[3];
-  } else {
-    int num_devices = args[2];
-    CHECK_EQ(args.num_args - 3, num_devices * 2)
-        << "The number of device_type and device_id passed in doesn't match, "
-           "or the number of is not the same as the number of devices.";
-    TVMContext ctx;
-    for (int i = 0; i < num_devices; i++) {
-      int dev_type = args[3 + i];
-      ctx.device_type = static_cast<DLDeviceType>(dev_type);
-      ctx.device_id = args[3 + i + num_devices];
-      if (ctx.device_type == static_cast<int>(kDLCPU)) {
-        ret[0] = ctx;
-      } else {
-        ret.push_back(ctx);
-      }
-    }
+  std::vector<TVMContext> ret;
+  TVMContext ctx;
+  for (int i = 2; i < args.num_args; i += 2) {
+    int dev_type = args[i];
+    ctx.device_type = static_cast<DLDeviceType>(dev_type);
+    ctx.device_id = args[i + 1];
+    ret.push_back(ctx);
   }
   return ret;
 }
@@ -720,25 +704,25 @@ std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
 // be passed in. The third one is the number of devices.
 // Eventually, we will only probably pass TVMContext for all the languages.
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      CHECK_GE(args.num_args, 4)
-          << "The expected number of arguments for graph_runtime.create is "
-             "at least 4, but it has "
-          << args.num_args;
-      const auto& contexts = GetAllContext(args);
-      *rv = GraphRuntimeCreate(args[0], args[1], contexts);
-    });
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4)
+        << "The expected number of arguments for graph_runtime.create is "
+           "at least 4, but it has "
+        << args.num_args;
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(args[0], args[1], contexts);
+  });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
-    .set_body([](TVMArgs args, TVMRetValue* rv) {
-      CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
-                                    "graph_runtime.remote_create is "
-                                    "at least 4, but it has "
-                                 << args.num_args;
-      void* mhandle = args[1];
-      const auto& contexts = GetAllContext(args);
-      *rv = GraphRuntimeCreate(
-          args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
-    });
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                  "graph_runtime.remote_create is "
+                                  "at least 4, but it has "
+                               << args.num_args;
+    void* mhandle = args[1];
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(
+        args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
+  });
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
index b8fcd089bc87..d129618375fa 100644
--- a/tests/python/unittest/test_runtime_heterogeneous.py
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -45,7 +45,7 @@ def get_simplex_graph(host_dev_type, device_dev_type):
         "inputs": [[0, 0, 0], [1, 0, 0]]
     }
     copy = {
-        "op": "device_copy_op",
+        "op": "tvm_op",
         "name": "__copy_add_to_sub",
         "attrs": {
             "flatten_data": "0",
@@ -167,7 +167,7 @@ def check_device(device, target_device):
         np.testing.assert_equal(
             out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
 
-    dev_tar = {"gpu": "cuda", "opencl": "opencl"}
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
     for device, target in dev_tar.items():
         check_device(device, target)
 
@@ -214,7 +214,7 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         "inputs": [[0, 0, 0], [1, 0, 0]]
     }
     copy_add_sub = {
-        "op": "device_copy_op",
+        "op": "tvm_op",
         "name": "__copy_add_to_sub",
         "attrs": {
             "flatten_data": "0",
@@ -236,7 +236,7 @@ def get_duplex_graph(host_dev_type, device_dev_type):
         "inputs": [[3, 0, 0], [4, 0, 0]]
     }
     copy_sub_add = {
-        "op": "device_copy_op",
+        "op": "tvm_op",
         "name": "__copy_sub_to_add",
         "attrs": {
             "flatten_data": "0",
@@ -396,7 +396,7 @@ def check_load_module():
         check_verify()
         check_load_module()
 
-    dev_tar = {"gpu": "cuda", "opencl": "opencl"}
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
     for device, target in dev_tar.items():
         check_device(device, target)
 

From c734697a1f4cd474d0cfc4d2909e5ed09a738990 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 21 Sep 2018 09:13:09 -0700
Subject: [PATCH 08/10] Remove ctx from GraphModule and fix typo

---
 python/tvm/build_module.py                    |  2 +-
 python/tvm/contrib/graph_runtime.py           | 19 ++++++-------------
 .../unittest/test_runtime_heterogeneous.py    |  2 +-
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 9f63d0a6a50d..8e0d16286d6a 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -386,7 +386,7 @@ def build(sch,
           name="default_function",
           binds=None,
           postpone_host_codegen=False):
-    """Build a function with arguments as signiture. Code will be generated
+    """Build a function with arguments as signature. Code will be generated
     for a device specified by the target. For homogeneous execution, a module
     that contains both host and device code is returned. For heterogeneous
     execution, a list of lowered functions for the host and a module containing
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 5ca55f77c6a8..515f74a438e3 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -1,12 +1,10 @@
 """Minimum graph runtime that executes graph containing TVM PackedFunc."""
 import numpy as np
-import tvm
 
 from .._ffi.base import string_types
 from .._ffi.function import get_global_func
 from .._ffi.runtime_ctypes import TVMContext
 from ..rpc import base as rpc_base
-from .. import ndarray as nd
 
 def create(graph_json_str, libmod, ctx):
     """Create a runtime executor module given a graph and module.
@@ -42,8 +40,7 @@ def create(graph_json_str, libmod, ctx):
         if not isinstance(cur_ctx, TVMContext):
             raise ValueError("ctx has to be the type of TVMContext or a list "
                              "of TVMContext")
-        if cur_ctx.device_type == tvm.cpu(0).device_type or \
-           cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
+        if cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
             ctx[0], ctx[i] = ctx[i], ctx[0]
 
     # ctx[0] is used as the primary/fallback context. All other ones are used
@@ -56,11 +53,11 @@ def create(graph_json_str, libmod, ctx):
         fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
         device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK
         return GraphModule(fcreate(graph_json_str, hmod, device_type,
-                                   ctx[0].device_id, *device_type_id), ctx[0])
+                                   ctx[0].device_id, *device_type_id))
 
     fcreate = get_global_func("tvm.graph_runtime.create")
     return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type,
-                               ctx[0].device_id, *device_type_id), ctx[0])
+                               ctx[0].device_id, *device_type_id))
 
 
 class GraphModule(object):
@@ -82,12 +79,9 @@ class GraphModule(object):
     ----------
     module : Module
         The interal tvm module that holds the actual graph functions.
-
-    ctx : TVMContext
-        The context this module is under
     """
 
-    def __init__(self, module, ctx):
+    def __init__(self, module):
         self.module = module
         self._set_input = module["set_input"]
         self._run = module["run"]
@@ -99,7 +93,6 @@ def __init__(self, module, ctx):
         except AttributeError:
             pass
         self._load_params = module["load_params"]
-        self.ctx = ctx
 
     def set_input(self, key=None, value=None, **params):
         """Set inputs to the module via kwargs
@@ -116,14 +109,14 @@ def set_input(self, key=None, value=None, **params):
            Additonal arguments
         """
         if key:
-            self._set_input(key, nd.array(value, ctx=self.ctx))
+            self._get_input(key).copyfrom(value)
 
         if params:
             # upload big arrays first to avoid memory issue in rpc mode
             keys = list(params.keys())
             keys.sort(key=lambda x: -np.prod(params[x].shape))
             for k in keys:
-                self._set_input(k, nd.array(params[k], ctx=self.ctx))
+                self._get_input(k).copyfrom(params[k])
 
     def run(self, **input_dict):
         """Run forward execution of the graph
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
index d129618375fa..b916ee285717 100644
--- a/tests/python/unittest/test_runtime_heterogeneous.py
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -9,7 +9,7 @@
 
 def get_simplex_graph(host_dev_type, device_dev_type):
     r""" Return the hand-crafted json object where only one copy node is
-    inserted. Tis node copies data from the target device to cpu.
+    inserted. This node copies data from the target device to cpu.
     The network is constructed as following:
                  A    B
                   \  /

From 66b208fb0d4d4de153103bcedf267311e5df2230 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 21 Sep 2018 14:17:42 -0700
Subject: [PATCH 09/10] fix rpc

---
 python/tvm/contrib/graph_runtime.py | 44 +++++++++++++++++------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 515f74a438e3..8b31175b6319 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -36,28 +36,39 @@ def create(graph_json_str, libmod, ctx):
     elif not isinstance(ctx, (list, tuple)):
         raise ValueError("ctx has to be the type of TVMContext or a list of "
                          "TVMCTVMContext")
-    for i, cur_ctx in enumerate(ctx):
+    for cur_ctx in ctx:
         if not isinstance(cur_ctx, TVMContext):
             raise ValueError("ctx has to be the type of TVMContext or a list "
                              "of TVMContext")
-        if cur_ctx.device_type >= rpc_base.RPC_SESS_MASK:
-            ctx[0], ctx[i] = ctx[i], ctx[0]
-
-    # ctx[0] is used as the primary/fallback context. All other ones are used
-    # as device context for heterogeneous execution.
-    device_type_id = [x for c in ctx[1:] for x in [c.device_type, c.device_id]]
-    if ctx[0].device_type >= rpc_base.RPC_SESS_MASK:
-        assert libmod.type_key == "rpc"
-        assert rpc_base._SessTableIndex(libmod) == ctx[0]._rpc_sess._tbl_index
+
+    # device_type_id[0], device_type_id[1] are used as the primary/fallback
+    # context type and id. All other ones are used as device context for
+    # heterogeneous execution.
+    num_rpc_ctx = 0
+    device_type_id = []
+    for cur_ctx in ctx:
+        device_type = cur_ctx.device_type
+        if device_type >= rpc_base.RPC_SESS_MASK:
+            assert libmod.type_key == "rpc"
+            assert rpc_base._SessTableIndex(
+                libmod) == cur_ctx._rpc_sess._tbl_index
+            num_rpc_ctx += 1
+            device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK
+        device_type_id.append(device_type)
+        device_type_id.append(cur_ctx.device_id)
+
+    if 0 < num_rpc_ctx < len(ctx):
+        raise ValueError("Either all or none of the contexts should be rpc.")
+
+    if num_rpc_ctx == len(ctx):
         hmod = rpc_base._ModuleHandle(libmod)
         fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        device_type = ctx[0].device_type % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, device_type,
-                                   ctx[0].device_id, *device_type_id))
+        return GraphModule(fcreate(graph_json_str, hmod, device_type_id[0],
+                                   device_type_id[1], *device_type_id[2:]))
 
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, ctx[0].device_type,
-                               ctx[0].device_id, *device_type_id))
+    return GraphModule(fcreate(graph_json_str, libmod, device_type_id[0],
+                               device_type_id[1], *device_type_id[2:]))
 
 
 class GraphModule(object):
@@ -72,9 +83,6 @@ class GraphModule(object):
     module : Module
         The interal tvm module that holds the actual graph functions.
 
-    ctx : TVMContext
-        The context this module is under
-
     Attributes
     ----------
     module : Module

From 143341d7d4e137c111e708e66f6b83b028fcc830 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 21 Sep 2018 19:48:33 -0700
Subject: [PATCH 10/10] pass *device_type_id

---
 python/tvm/contrib/graph_runtime.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 8b31175b6319..f0e83eec0bb8 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -63,12 +63,10 @@ def create(graph_json_str, libmod, ctx):
     if num_rpc_ctx == len(ctx):
         hmod = rpc_base._ModuleHandle(libmod)
         fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        return GraphModule(fcreate(graph_json_str, hmod, device_type_id[0],
-                                   device_type_id[1], *device_type_id[2:]))
+        return GraphModule(fcreate(graph_json_str, hmod, *device_type_id))
 
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, device_type_id[0],
-                               device_type_id[1], *device_type_id[2:]))
+    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
 
 
 class GraphModule(object):