From 62cb18053ac438b039477ef343b7e73c3f649ef2 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 7 Nov 2019 07:39:05 +0000 Subject: [PATCH 01/22] [tvm] A minimum runtime for external library --- CMakeLists.txt | 1 + cmake/config.cmake | 5 + cmake/modules/contrib/Extern.cmake | 28 ++ include/tvm/runtime/module.h | 1 + python/tvm/contrib/graph_runtime.py | 13 +- python/tvm/module.py | 7 +- src/runtime/contrib/external_util.h | 64 +++++ src/runtime/contrib/gcc/gcc.cc | 83 ++++++ src/runtime/contrib/gcc/gcc.h | 58 +++++ src/runtime/dso_module.cc | 97 ++----- src/runtime/dso_module.h | 122 +++++++++ src/runtime/graph/graph_runtime.cc | 49 +++- src/runtime/graph/graph_runtime.h | 8 + src/runtime/module.cc | 11 +- tests/python/relay/test_external_runtime.py | 270 ++++++++++++++++++++ 15 files changed, 725 insertions(+), 92 deletions(-) create mode 100644 cmake/modules/contrib/Extern.cmake create mode 100644 src/runtime/contrib/external_util.h create mode 100644 src/runtime/contrib/gcc/gcc.cc create mode 100644 src/runtime/contrib/gcc/gcc.h create mode 100644 src/runtime/dso_module.h create mode 100644 tests/python/relay/test_external_runtime.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bea818b7581..e5d7c7c1ede1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,7 @@ include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/Random.cmake) +include(cmake/modules/contrib/Extern.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 51c929233aa6..41a8f365dd2a 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -143,6 +143,11 @@ set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) +# Whether use contrib extern (use ";" to separate multiple externs) +# Available externs: +# gcc +# set(USE_EXTERN 
none) + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake new file mode 100644 index 000000000000..697a2c5289c3 --- /dev/null +++ b/cmake/modules/contrib/Extern.cmake @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +message(STATUS "Build with relay.backend.contrib") + +list(FIND USE_EXTERN "gcc" _gcc_idx) +if(_gcc_idx GREATER -1) + file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) + list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) + + file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) + list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) + message(STATUS "Use extern library: GCC") +endif() diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index ff096eec5a43..69db2691a5bf 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -79,6 +79,7 @@ class Module : public ObjectRef { * \brief Load a module from file. * \param file_name The name of the host function module. * \param format The format of the file. 
+ * \param external_lib_name The name of the external library. * \note This function won't load the import relationship. * Re-create import relationship by calling Import. */ diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 3e182e26fd22..00053b3f065d 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -22,7 +22,7 @@ from .._ffi.runtime_ctypes import TVMContext from ..rpc import base as rpc_base -def create(graph_json_str, libmod, ctx): +def create(graph_json_str, libmod, ctx, ext_lib=None): """Create a runtime executor module given a graph and module. Parameters ---------- @@ -30,13 +30,19 @@ def create(graph_json_str, libmod, ctx): The graph to be deployed in json format output by nnvm graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. + libmod : tvm.Module The module of the corresponding function + ctx : TVMContext or list of TVMContext The context to deploy the module. It can be local or remote when there is only one TVMContext. Otherwise, the first context in the list will be used as this purpose. All context should be given for heterogeneous execution. + + ext_lib: tvm.Module + The module contains library functions from external codegen tools. 
+ Returns ------- graph_module : GraphModule @@ -51,12 +57,15 @@ def create(graph_json_str, libmod, ctx): ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx) if num_rpc_ctx == len(ctx): + if ext_lib: + raise Exception("External library is not supported for remote " + "execution") hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") return GraphModule(fcreate(graph_json_str, hmod, *device_type_id)) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) + return GraphModule(fcreate(graph_json_str, libmod, ext_lib, *device_type_id)) def get_device_ctx(libmod, ctx): """Parse and validate all the device context(s). diff --git a/python/tvm/module.py b/python/tvm/module.py index 98a35926cfa6..bd44937d4541 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -227,7 +227,7 @@ def system_lib(): return _GetSystemLib() -def load(path, fmt=""): +def load(path, fmt="", ext_lib=""): """Load module from file. Parameters @@ -239,6 +239,9 @@ def load(path, fmt=""): The format of the file, if not specified it will be inferred from suffix of the file. + ext_lib : str, optional + The string to indicate the name of the external codegen + Returns ------- module : Module @@ -261,7 +264,7 @@ def load(path, fmt=""): _cc.create_shared(path + ".so", files) path += ".so" # Redirect to the load API - return _LoadFromFile(path, fmt) + return _LoadFromFile(path, fmt, ext_lib) def enabled(target): diff --git a/src/runtime/contrib/external_util.h b/src/runtime/contrib/external_util.h new file mode 100644 index 000000000000..cff3a1e6b903 --- /dev/null +++ b/src/runtime/contrib/external_util.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/external_util.h + * \brief The definition of utility function for the external runtime. + */ + +#ifndef TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ +#define TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ + +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +/*! + * \brief Extract the subgraph ID from an encoded function name. + * + * \param name The encoded function name, e.g. "subgraph_0". + * + * \return The subgraph ID string that follows the "subgraph" prefix. 
+ */ +static inline std::string GetSubgraphID(const std::string& name) { + std::string temp = name; + std::vector tokens; + std::string delimiter = "_"; + size_t pos = 0; + std::string token; + while ((pos = temp.find(delimiter)) != std::string::npos) { + token = temp.substr(0, pos); + tokens.push_back(token); + temp.erase(0, pos + delimiter.length()); + } + tokens.push_back(temp); + + CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; + CHECK(tokens[0] == "subgraph") + << "Function name does not start with \"subgraph\": " << name; + return tokens[1]; +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc new file mode 100644 index 000000000000..0aa3b42c0f55 --- /dev/null +++ b/src/runtime/contrib/gcc/gcc.cc @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "gcc.h" + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +runtime::PackedFunc GccModuleNode::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + if (name == "init") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + this->Init(args[0]); + }); + } else { + std::string curr_id = GetSubgraphID(name); + + CHECK(IsLoaded()) << "The external module has not been built or failed to open.\n"; + // Generate an external packed function + return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { + CHECK_GT(args.size(), 0U) << "No input is provided."; + + NDArray input0 = args[0]; + const DLTensor* dptr = input0.operator->(); + CHECK(dptr) << "Expect a NDArray as the input."; + runtime::NDArray out_arg = args[args.size() - 1]; + auto out = reinterpret_cast(out_arg->data); + + // Get function from the library + std::string encoded_name = "gcc_" + curr_id; + auto func_s = reinterpret_cast(this->GetSymbol(encoded_name.c_str())); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + GccPackedArgs packed_args; + packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); + for (int i = 0; i < args.size() - 1; ++i) { + runtime::NDArray arg = args[i]; + packed_args.data[i] = reinterpret_cast(arg->data); + } + (*func_s)(packed_args, out); + } else { + LOG(FATAL) << "Only float32 values are supported."; + } + *rv = out; + }); + } +} + +TVM_REGISTER_GLOBAL("module.loadfile_gcc_so") +.set_body([](TVMArgs args, TVMRetValue* rv) { + std::shared_ptr n = std::make_shared(); + n->Init(args[0]); + *rv = runtime::Module(n); + }); + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/gcc/gcc.h b/src/runtime/contrib/gcc/gcc.h new file mode 100644 index 000000000000..969b036c7419 --- /dev/null +++ 
b/src/runtime/contrib/gcc/gcc.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_GCC_GCC_H_ +#define TVM_RUNTIME_CONTRIB_GCC_GCC_H_ + +#include +#include +#include + +#include "../external_util.h" +#include "../../dso_module.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +constexpr const char* kGccPrefix = "gcc_"; + +/*! + * \brief Defined a data structure to save subgraph args. + */ +typedef struct { + float** data; +} GccPackedArgs; + +typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); + +class GccModuleNode : public DSOModuleNode { + public: + const char* type_key() const final { + return "GccModule"; + } + + runtime::PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_GCC_GCC_H_ diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc index abbbe124a569..5eee836a0b14 100644 --- a/src/runtime/dso_module.cc +++ b/src/runtime/dso_module.cc @@ -18,34 +18,43 @@ */ /*! - * \file dso_dll_module.cc + * \file dso_module.cc * \brief Module to load from dynamic shared library. 
*/ +#include "dso_module.h" + #include #include #include #include +#include #include "module_util.h" -#if defined(_WIN32) -#include -#else -#include -#endif - namespace tvm { namespace runtime { -// Module to load from dynamic shared libary. -// This is the default module TVM used for host-side AOT -class DSOModuleNode final : public ModuleNode { - public: - ~DSOModuleNode() { - if (lib_handle_) Unload(); +PackedFunc DSOModuleNode::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + BackendPackedCFunc faddr; + if (name == runtime::symbol::tvm_module_main) { + const char* entry_name = reinterpret_cast( + GetSymbol(runtime::symbol::tvm_module_main)); + CHECK(entry_name!= nullptr) + << "Symbol " << runtime::symbol::tvm_module_main << " is not presented"; + faddr = reinterpret_cast(GetSymbol(entry_name)); + } else { + faddr = reinterpret_cast(GetSymbol(name.c_str())); } + if (faddr == nullptr) return PackedFunc(); + return WrapPackedFunc(faddr, sptr_to_self); +} - const char* type_key() const final { - return "dso"; +void DSOModuleNode::Init(const std::string& name) { + Load(name); + if (auto *ctx_addr = + reinterpret_cast(GetSymbol(runtime::symbol::tvm_module_ctx))) { + *ctx_addr = this; } PackedFunc GetFunction( @@ -64,63 +73,7 @@ class DSOModuleNode final : public ModuleNode { if (faddr == nullptr) return PackedFunc(); return WrapPackedFunc(faddr, sptr_to_self); } - - void Init(const std::string& name) { - Load(name); - if (auto *ctx_addr = - reinterpret_cast(GetSymbol(runtime::symbol::tvm_module_ctx))) { - *ctx_addr = this; - } - InitContextFunctions([this](const char* fname) { - return GetSymbol(fname); - }); - // Load the imported modules - const char* dev_mblob = - reinterpret_cast( - GetSymbol(runtime::symbol::tvm_dev_mblob)); - if (dev_mblob != nullptr) { - ImportModuleBlob(dev_mblob, &imports_); - } - } - - private: - // Platform dependent handling. 
-#if defined(_WIN32) - // library handle - HMODULE lib_handle_{nullptr}; - // Load the library - void Load(const std::string& name) { - // use wstring version that is needed by LLVM. - std::wstring wname(name.begin(), name.end()); - lib_handle_ = LoadLibraryW(wname.c_str()); - CHECK(lib_handle_ != nullptr) - << "Failed to load dynamic shared library " << name; - } - void* GetSymbol(const char* name) { - return reinterpret_cast( - GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) - } - void Unload() { - FreeLibrary(lib_handle_); - } -#else - // Library handle - void* lib_handle_{nullptr}; - // load the library - void Load(const std::string& name) { - lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(lib_handle_ != nullptr) - << "Failed to load dynamic shared library " << name - << " " << dlerror(); - } - void* GetSymbol(const char* name) { - return dlsym(lib_handle_, name); - } - void Unload() { - dlclose(lib_handle_); - } -#endif -}; +} TVM_REGISTER_GLOBAL("module.loadfile_so") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/dso_module.h b/src/runtime/dso_module.h new file mode 100644 index 000000000000..2b27b45ed447 --- /dev/null +++ b/src/runtime/dso_module.h @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/dso_module.h + * \brief Module to load from dynamic shared library. + */ +#ifndef TVM_RUNTIME_DSO_MODULE_H_ +#define TVM_RUNTIME_DSO_MODULE_H_ + +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +namespace tvm { +namespace runtime { + +// Module to load from dynamic shared libary. +// This is the default module TVM used for host-side AOT +class DSOModuleNode : public ModuleNode { + public: + ~DSOModuleNode() { + if (lib_handle_) Unload(); + } + + virtual const char* type_key() const { + return "dso"; + } + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. This function can be implemented by + * different backends as well to implement their own way of retrieving + * a function poninter and invoking it. + * + * \param name the name of the external function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. + */ + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) override; + + /*! + * \brief Initialize the module using a prvided shared library. + * \param name. The dynamically linked shared library. + */ + void Init(const std::string& name); + + protected: + // Platform dependent handling. +#if defined(_WIN32) + // library handle + HMODULE lib_handle_{nullptr}; + // Load the library + void Load(const std::string& name) { + // use wstring version that is needed by LLVM. 
+ std::wstring wname(name.begin(), name.end()); + lib_handle_ = LoadLibraryW(wname.c_str()); + CHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name; + } + void* GetSymbol(const char* name) { + return reinterpret_cast( + GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) + } + void Unload() { + FreeLibrary(lib_handle_); + } + // Check if the handle_ is open. + bool IsLoaded() const { + return lib_handle_ != nullptr; + } +#else + // Library handle + void* lib_handle_{nullptr}; + // load the library + void Load(const std::string& name) { + lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + CHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name + << " " << dlerror(); + } + void* GetSymbol(const char* name) { + return dlsym(lib_handle_, name); + } + void Unload() { + dlclose(lib_handle_); + } + // Check if the handle_ is open. + bool IsLoaded() const { + return lib_handle_ != nullptr; + } +#endif +}; + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_DSO_MODULE_H_ diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 9ad10c1232c3..cca671ac0091 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -67,6 +67,7 @@ void GraphRuntime::Run() { */ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, + tvm::runtime::Module ext_module, const std::vector& ctxs) { #ifndef _LIBCPP_SGX_NO_IOSTREAMS std::istringstream is(graph_json); @@ -76,6 +77,7 @@ void GraphRuntime::Init(const std::string& graph_json, dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; + ext_module_ = ext_module; ctxs_ = ctxs; this->SetupStorage(); this->SetupOpExecs(); @@ -332,27 +334,47 @@ void GraphRuntime::SetupOpExecs() { const auto& inode = nodes_[nid]; if (inode.op_type == "null") continue; std::vector args; + std::vector arity; for (const auto& e : inode.inputs) { uint32_t eid = 
this->entry_id(e); + arity.push_back(eid); args.push_back(*(data_entry_[eid].operator->())); } for (uint32_t index = 0; index < inode.param.num_outputs; ++index) { uint32_t eid = this->entry_id(nid, index); + arity.push_back(eid); args.push_back(*(data_entry_[eid].operator->())); } - CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; + CHECK(inode.op_type == "tvm_op" || inode.op_type == "external_op") + << "Can only take tvm_op or external_op as op"; - std::shared_ptr op_args = nullptr; - std::tie(op_execs_[nid], op_args) = + if (inode.op_type == "tvm_op") { + std::shared_ptr op_args = nullptr; + std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args, inode.inputs.size()); - for (size_t i = 0; i < inode.inputs.size(); i++) { - uint32_t eid = this->entry_id(inode.inputs[i]); - // check if op input is model input - if (input_node_eids.count(eid) > 0) { - input_dltensors_[eid].push_back( - static_cast(op_args->arg_values[i].v_handle)); + for (size_t i = 0; i < inode.inputs.size(); i++) { + uint32_t eid = this->entry_id(inode.inputs[i]); + // check if op input is model input + if (input_node_eids.count(eid) > 0) { + input_dltensors_[eid].push_back( + static_cast(op_args->arg_values[i].v_handle)); + } } + } else if (inode.op_type == "external_op") { + tvm::runtime::PackedFunc pf = ext_module_.GetFunction(inode.param.func_name, false); + CHECK(pf != nullptr) << "no such function in module: " << inode.param.func_name; + auto fexec = [pf, arity, this]() { + std::vector values(arity.size()); + std::vector codes(arity.size()); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + for (size_t i = 0; i < arity.size(); i++) { + setter(i, this->data_entry_[arity[i]]); + } + TVMRetValue rv; + pf.CallPacked(TVMArgs(values.data(), codes.data(), arity.size()), &rv); + }; + op_execs_[nid] = fexec; } } } @@ -477,6 +499,7 @@ PackedFunc GraphRuntime::GetFunction( Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, + 
const tvm::runtime::Module& ext_m, const std::vector& ctxs) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs); + exec->Init(sym_json, m, ext_m, ctxs); @@ -488,7 +511,7 @@ std::vector GetAllContext(const TVMArgs& args) { // Reserve the first item as the fallback device. std::vector ret; TVMContext ctx; - for (int i = 2; i < args.num_args; i += 2) { + for (int i = 3; i < args.num_args; i += 2) { int dev_type = args[i]; ctx.device_type = static_cast(dev_type); ctx.device_id = args[i + 1]; @@ -504,12 +527,12 @@ std::vector GetAllContext(const TVMArgs& args) { // Eventually, we will only probably pass TVMContext for all the languages. TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) + CHECK_GE(args.num_args, 5) << "The expected number of arguments for graph_runtime.create is " - "at least 4, but it has " + "at least 5, but it has " << args.num_args; const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], args[1], contexts); + *rv = GraphRuntimeCreate(args[0], args[1], args[2], contexts); }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index c83d68e08159..cded19f15245 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -96,12 +96,15 @@ class GraphRuntime : public ModuleNode { * \param graph_json The execution graph. * \param module The module containing the compiled functions for the host * processor. + * \param ext_module The module containing the compiled functions using + * external codegen tools. * \param ctxs The context of the host and devices where graph nodes will be * executed on. */ void Init(const std::string& graph_json, tvm::runtime::Module module, + tvm::runtime::Module ext_module, const std::vector& ctxs); /*! @@ -408,6 +411,11 @@ class GraphRuntime : public ModuleNode { GraphAttr attrs_; /*! \brief The code module that contains both host and device code. 
*/ tvm::runtime::Module module_; + /*! + * \brief The code module that contains external library. + * TODO(zhiics) Support multiple external modules. + */ + tvm::runtime::Module ext_module_; /*! \brief Execution context of all devices including the host. */ std::vector ctxs_; /*! \brief Common storage pool for all devices. */ diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 161675c7ca0c..b1d4c5046bd8 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -76,7 +76,8 @@ PackedFunc ModuleNode::GetFunction(const std::string& name, bool query_imports) } Module Module::LoadFromFile(const std::string& file_name, - const std::string& format) { + const std::string& format, + const std::string& external_lib_name) { #ifndef _LIBCPP_SGX_CONFIG std::string fmt = GetFileFormat(file_name, format); CHECK(fmt.length() != 0) @@ -84,7 +85,11 @@ Module Module::LoadFromFile(const std::string& file_name, if (fmt == "dll" || fmt == "dylib" || fmt == "dso") { fmt = "so"; } - std::string load_f_name = "module.loadfile_" + fmt; + std::string load_f_name = "module.loadfile_"; + if (!external_lib_name.empty()) { + load_f_name = load_f_name + external_lib_name + "_"; + } + load_f_name += fmt; const PackedFunc* f = Registry::Get(load_f_name); CHECK(f != nullptr) << "Loader of " << format << "(" @@ -196,7 +201,7 @@ TVM_REGISTER_GLOBAL("module._GetTypeKey") TVM_REGISTER_GLOBAL("module._LoadFromFile") .set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Module::LoadFromFile(args[0], args[1]); + *ret = Module::LoadFromFile(args[0], args[1], args[2]); }); TVM_REGISTER_GLOBAL("module._SaveToFile") diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py new file mode 100644 index 000000000000..42e9ec85271b --- /dev/null +++ b/tests/python/relay/test_external_runtime.py @@ -0,0 +1,270 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for external runtime.""" +import os +from shutil import which +import numpy as np +import json + +import tvm +from tvm import relay + + +def generate_multinode_binary(): + '''Generate a binary''' + + code = r''' + # include + # include + # include + + typedef struct { + float** data; + } GccPackedArgs; + + # define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + out[i] = a[i] p_OP_ b[i]; \ + } \ + } + + # define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ + } + GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10); + GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10); + GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10); + extern "C" void gcc_1(GccPackedArgs args, float* out) { + float* gcc_input4 = args.data[0]; + float* gcc_input5 = args.data[1]; + float* gcc_input6 = args.data[2]; + float* gcc_input7 = args.data[3]; + float* buf_0 = (float*)malloc(4 * 100); + float* buf_1 = (float*)malloc(4 * 100); + float* buf_2 = (float*)malloc(4 * 100); + gcc_1_2(gcc_input4, gcc_input5, buf_0); + 
gcc_1_1(buf_0, gcc_input6, buf_1); + gcc_1_0(buf_1, gcc_input7, buf_2); + memcpy(out, buf_2, 4 *100); + } + GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10); + GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10); + GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10); + extern "C" void gcc_0(GccPackedArgs args, float* out) { + float* gcc_input0 = args.data[0]; + float* gcc_input1 = args.data[1]; + float* gcc_input2 = args.data[2]; + float* gcc_input3 = args.data[3]; + float* buf_0 = (float*)malloc(4 * 100); + float* buf_1 = (float*)malloc(4 * 100); + float* buf_2 = (float*)malloc(4 * 100); + gcc_0_2(gcc_input0, gcc_input1, buf_0); + gcc_0_1(buf_0, gcc_input2, buf_1); + gcc_0_0(buf_1, gcc_input3, buf_2); + memcpy(out, buf_2, 4 *100); + } + ''' + code = "echo \'" + code + "\'" + cmd = "g++ -std=c++11 -shared -fPIC -ldl -o external_test.so -xc++ -" + cmd = code + " | " + cmd + if os.system(cmd) != 0: + raise RuntimeError("Compilation for external_test.so failed") + + +def get_synthetic_lib(): + x = relay.var('x', shape=(10, 10)) + w0 = relay.var('w0', shape=(10, 10)) + w1 = relay.var('w1', shape=(10, 10)) + w2 = relay.var('w2', shape=(10, 10)) + w3 = relay.var('w3', shape=(10, 10)) + w4 = relay.var('w4', shape=(10, 10)) + w5 = relay.var('w5', shape=(10, 10)) + w6 = relay.var('w6', shape=(10, 10)) + w7 = relay.var('w7', shape=(10, 10)) + + # subgraph0 + gcc_input0 = relay.var('gcc_input0', shape=(10, 10)) + gcc_input1 = relay.var('gcc_input1', shape=(10, 10)) + gcc_input2 = relay.var('gcc_input2', shape=(10, 10)) + gcc_input3 = relay.var('gcc_input3', shape=(10, 10)) + subgraph0 = relay.Function([gcc_input0, gcc_input1, gcc_input2, + gcc_input3], relay.copy(gcc_input0)) + subgraph0 = subgraph0.set_attribute( + "Primitive", tvm.expr.IntImm("int32", 1)) + + # Call subgraph0 + subgraph0_ret = relay.Call(subgraph0, [x, w0, w1, w2]) + + # subgraph1 + gcc_input4 = relay.var('gcc_input4', shape=(10, 10)) + gcc_input5 = relay.var('gcc_input5', shape=(10, 10)) + gcc_input6 = relay.var('gcc_input6', shape=(10, 10)) + 
gcc_input7 = relay.var('gcc_input7', shape=(10, 10)) + add1 = relay.add(gcc_input4, gcc_input5) + sub1 = relay.subtract(add1, gcc_input6) + mul1 = relay.multiply(sub1, gcc_input7) + subgraph1 = relay.Function([gcc_input4, gcc_input5, gcc_input6, + gcc_input7], relay.copy(gcc_input4)) + subgraph1 = subgraph1.set_attribute( + "Primitive", tvm.expr.IntImm("int32", 1)) + + # Call subgraph1 + subgraph1_ret = relay.Call(subgraph1, [x, w3, w4, w5]) + + # Other ops that will be executed on TVM. + add2 = relay.add(x, w6) + sub2 = relay.subtract(add2, w7) + ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0) + func = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], ret) + mod = relay.Module.from_expr(func) + _, lib, _ = relay.build(mod, "llvm") + return lib + + +def get_json(): + nodex = {"op": "null", "name": "x", "inputs": []} + node0 = {"op": "null", "name": "w0", "inputs": []} + node1 = {"op": "null", "name": "w1", "inputs": []} + node2 = {"op": "null", "name": "w2", "inputs": []} + node3 = {"op": "null", "name": "w3", "inputs": []} + node4 = {"op": "null", "name": "w4", "inputs": []} + node5 = {"op": "null", "name": "w5", "inputs": []} + node6 = {"op": "null", "name": "w6", "inputs": []} + node7 = {"op": "null", "name": "w7", "inputs": []} + + subgraph0 = { + "op": "external_op", + "name": "subgraph_0", + "attrs": { + "num_outputs": "1", + "num_inputs": "4", + "func_name": "subgraph_0", + "flatten_data": "0" + }, + "inputs": [ + [0, 0, 0], + [1, 0, 0], + [2, 0, 0], + [3, 0, 0], + ] + } + subgraph1 = { + "op": "external_op", + "name": "subgraph_1", + "attrs": { + "num_outputs": "1", + "num_inputs": "4", + "func_name": "subgraph_1", + "flatten_data": "0" + }, + "inputs": [ + [0, 0, 0], + [4, 0, 0], + [5, 0, 0], + [6, 0, 0], + ] + } + + fused_op = { + "op": "tvm_op", + "name": "fused_add_subtract_concatenate", + "attrs": { + "num_outputs": "1", + "num_inputs": "5", + "func_name": "fused_add_subtract_concatenate", + "flatten_data": "0" + }, + "inputs": [ + 
[9, 0, 0], + [10, 0, 0], + [0, 0, 0], + [7, 0, 0], + [8, 0, 0] + ] + } + nodes = [nodex, node0, node1, node2, node3, node4, + node5, node6, node7, subgraph0, subgraph1, fused_op] + arg_nodes = [0, 1, 2, 3, 4, 5, 6, 7, 8] + heads = [[11, 0, 0]] + node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + storage_id = ["list_int", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]] + + shape = ["list_shape", [ + [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], + [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [30, 10]]] + + dltype = ["list_str", [ + "float32", "float32", "float32", "float32", "float32", "float32", + "float32", "float32", "float32", "float32", "float32", "float32"]] + + attrs = { + "shape": shape, + "dltype": dltype, + "storage_id": storage_id, + } + + graph = {"nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": heads, + "attrs": attrs} + + return json.dumps(graph) + + +def test_simulated_runtime(): + if which("gcc") is None: + print("Skip test because gcc is not available.") + + # library that contains external code. 
+ generate_multinode_binary() + + json = get_json() + lib = get_synthetic_lib() + ext_lib = tvm.module.load("external_test.so", ext_lib="gcc") + + mod = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0), ext_lib) + + x_data = np.random.rand(10, 10).astype('float32') + mod.set_input("x", x_data) + w_data = [] + for i in range(8): + data = np.random.rand(10, 10).astype('float32') + w_data.append(data) + var = "w" + str(i) + mod.set_input(var, data) + mod.run() + out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) + out = mod.get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), + np.concatenate( + (((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) + + +if __name__ == "__main__": + test_simulated_runtime() From 67657218431a34bb27a27b94ad10a8b84dbc4168 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 8 Nov 2019 19:09:03 +0000 Subject: [PATCH 02/22] fix lint and no external lib for rpc --- python/tvm/contrib/debugger/debug_runtime.py | 6 ++++-- python/tvm/contrib/graph_runtime.py | 2 +- src/runtime/contrib/gcc/gcc.cc | 1 + src/runtime/contrib/gcc/gcc.h | 1 + src/runtime/dso_module.cc | 1 + src/runtime/dso_module.h | 1 + .../graph/debug/graph_runtime_debug.cc | 20 ++++++++++++------- src/runtime/graph/graph_runtime.cc | 10 ++++++++-- 8 files changed, 30 insertions(+), 12 deletions(-) diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index f77a927eeabf..05b91d99daf1 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -30,7 +30,7 @@ _DUMP_PATH_PREFIX = "_tvmdbg_" -def create(graph_json_str, libmod, ctx, dump_root=None): +def create(graph_json_str, libmod, ctx, ext_lib=None, dump_root=None): """Create a runtime executor module given a graph and module. 
Parameters @@ -69,6 +69,8 @@ def create(graph_json_str, libmod, ctx, dump_root=None): ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) if num_rpc_ctx == len(ctx): + if ext_lib: + raise Exception("RPC doesn't support external library yet") libmod = rpc_base._ModuleHandle(libmod) try: fcreate = ctx[0]._rpc_sess.get_function( @@ -79,7 +81,7 @@ def create(graph_json_str, libmod, ctx, dump_root=None): "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " "config.cmake and rebuild TVM to enable debug mode" ) - func_obj = fcreate(graph_json_str, libmod, *device_type_id) + func_obj = fcreate(graph_json_str, libmod, ext_lib, *device_type_id) return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 00053b3f065d..cc9ca23562a4 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -62,7 +62,7 @@ def create(graph_json_str, libmod, ctx, ext_lib=None): "execution") hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") - return GraphModule(fcreate(graph_json_str, hmod, *device_type_id)) + return GraphModule(fcreate(graph_json_str, hmod, ext_lib, *device_type_id)) fcreate = get_global_func("tvm.graph_runtime.create") return GraphModule(fcreate(graph_json_str, libmod, ext_lib, *device_type_id)) diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc index 0aa3b42c0f55..e2008e2b850d 100644 --- a/src/runtime/contrib/gcc/gcc.cc +++ b/src/runtime/contrib/gcc/gcc.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include namespace tvm { diff --git a/src/runtime/contrib/gcc/gcc.h b/src/runtime/contrib/gcc/gcc.h index 969b036c7419..7f05b0d415d7 100644 --- a/src/runtime/contrib/gcc/gcc.h +++ b/src/runtime/contrib/gcc/gcc.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "../external_util.h" #include "../../dso_module.h" diff 
--git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc index 5eee836a0b14..4f2eb7515b2c 100644 --- a/src/runtime/dso_module.cc +++ b/src/runtime/dso_module.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "module_util.h" diff --git a/src/runtime/dso_module.h b/src/runtime/dso_module.h index 2b27b45ed447..2ca084fdc487 100644 --- a/src/runtime/dso_module.h +++ b/src/runtime/dso_module.h @@ -26,6 +26,7 @@ #include #include +#include #include #if defined(_WIN32) diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index ab28cb662f2a..6bce89daec07 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -202,10 +202,12 @@ PackedFunc GraphRuntimeDebug::GetFunction( * \brief GraphRuntimeDebugCreate Get the function based on input. * \param sym_json The graph symbol in json format. * \param m Compiled module which will be loaded. + * \param ext_m Compiled module that contains code from external library. * \param ctxs All devices contexts. 
*/ Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const tvm::runtime::Module& ext_m, const std::vector& ctxs) { auto exec = make_object(); exec->Init(sym_json, m, ctxs); @@ -214,18 +216,22 @@ Module GraphRuntimeDebugCreate(const std::string& sym_json, TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) - << "The expected number of arguments for graph_runtime.create is " - "at least 4, but it has " - << args.num_args; - *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args)); - }); + CHECK_GE(args.num_args, 5) + << "The expected number of arguments for graph_runtime.create is " + "at least 5, but it has " + << args.num_args; + if (args[2].type_code() == kModuleHandle) { + *rv = GraphRuntimeDebugCreate(args[0], args[1], args[2], GetAllContext(args)); + } else { + *rv = GraphRuntimeDebugCreate(args[0], args[1], Module(nullptr), GetAllContext(args)); + } +}); TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.remote_create") .set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_GE(args.num_args, 4) << "The expected number of arguments for " "graph_runtime.remote_create is " - "at least 4, but it has " + "at least 5, but it has " << args.num_args; void* mhandle = args[1]; ModuleNode* mnode = ObjectInternal::GetModuleNode(mhandle); diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index cca671ac0091..7331250e44da 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -532,19 +532,25 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") "at least 5, but it has " << args.num_args; const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], args[1], args[2], contexts); + if (args[2].type_code() == kModuleHandle) { + *rv = GraphRuntimeCreate(args[0], args[1], args[2], contexts); + } else { + *rv = GraphRuntimeCreate(args[0], args[1], Module(nullptr), contexts); 
+ } }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") .set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_GE(args.num_args, 4) << "The expected number of arguments for " "graph_runtime.remote_create is " - "at least 4, but it has " + "at least 5, but it has " << args.num_args; void* mhandle = args[1]; ModuleNode* mnode = ObjectInternal::GetModuleNode(mhandle); const auto& contexts = GetAllContext(args); + // TODO(zhiics) RPC is not supported for external library. + CHECK_NE(args[2].type_code(), kModuleHandle) << "External library is not supported by RPC"; *rv = GraphRuntimeCreate( args[0], GetRef(mnode), contexts); }); From b24f83a99a119d4ecd4277aabe8d0ad717ea95e5 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 11 Nov 2019 08:14:53 +0000 Subject: [PATCH 03/22] remove gcc extern runtime --- CMakeLists.txt | 1 - cmake/config.cmake | 5 - cmake/modules/contrib/Extern.cmake | 28 ---- include/tvm/runtime/module.h | 1 - python/tvm/contrib/debugger/debug_runtime.py | 6 +- python/tvm/contrib/graph_runtime.py | 15 +- python/tvm/module.py | 14 +- src/codegen/source_module.cc | 3 + src/runtime/contrib/external_util.h | 64 -------- src/runtime/contrib/gcc/gcc.cc | 84 ----------- src/runtime/contrib/gcc/gcc.h | 59 -------- src/runtime/dso_module.cc | 96 ++++++++---- src/runtime/dso_module.h | 123 ---------------- .../graph/debug/graph_runtime_debug.cc | 20 +-- src/runtime/graph/graph_runtime.cc | 57 ++------ src/runtime/graph/graph_runtime.h | 8 - src/runtime/module.cc | 11 +- tests/python/relay/test_external_runtime.py | 137 ++++++++++-------- 18 files changed, 191 insertions(+), 541 deletions(-) delete mode 100644 cmake/modules/contrib/Extern.cmake delete mode 100644 src/runtime/contrib/external_util.h delete mode 100644 src/runtime/contrib/gcc/gcc.cc delete mode 100644 src/runtime/contrib/gcc/gcc.h delete mode 100644 src/runtime/dso_module.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e5d7c7c1ede1..2bea818b7581 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -246,7 +246,6 @@ include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/Random.cmake) -include(cmake/modules/contrib/Extern.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 41a8f365dd2a..51c929233aa6 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -143,11 +143,6 @@ set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) -# Whether use contrib extern (use ";" to separate multiple externs) -# Available externs: -# gcc -# set(USE_EXTERN none) - # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake deleted file mode 100644 index 697a2c5289c3..000000000000 --- a/cmake/modules/contrib/Extern.cmake +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -message(STATUS "Build with relay.backend.contrib") - -list(FIND USE_EXTERN "gcc" _gcc_idx) -if(_gcc_idx GREATER -1) - file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) - list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) - - file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) - list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) - message(STATUS "Use extern library: GCC") -endif() diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 69db2691a5bf..ff096eec5a43 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -79,7 +79,6 @@ class Module : public ObjectRef { * \brief Load a module from file. * \param file_name The name of the host function module. * \param format The format of the file. - * \param external_lib_name The name of the external library. * \note This function won't load the import relationship. * Re-create import relationship by calling Import. */ diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 05b91d99daf1..f77a927eeabf 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -30,7 +30,7 @@ _DUMP_PATH_PREFIX = "_tvmdbg_" -def create(graph_json_str, libmod, ctx, ext_lib=None, dump_root=None): +def create(graph_json_str, libmod, ctx, dump_root=None): """Create a runtime executor module given a graph and module. 
Parameters @@ -69,8 +69,6 @@ def create(graph_json_str, libmod, ctx, ext_lib=None, dump_root=None): ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) if num_rpc_ctx == len(ctx): - if ext_lib: - raise Exception("RPC doesn't support external library yet") libmod = rpc_base._ModuleHandle(libmod) try: fcreate = ctx[0]._rpc_sess.get_function( @@ -81,7 +79,7 @@ def create(graph_json_str, libmod, ctx, ext_lib=None, dump_root=None): "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " "config.cmake and rebuild TVM to enable debug mode" ) - func_obj = fcreate(graph_json_str, libmod, ext_lib, *device_type_id) + func_obj = fcreate(graph_json_str, libmod, *device_type_id) return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index cc9ca23562a4..3e182e26fd22 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -22,7 +22,7 @@ from .._ffi.runtime_ctypes import TVMContext from ..rpc import base as rpc_base -def create(graph_json_str, libmod, ctx, ext_lib=None): +def create(graph_json_str, libmod, ctx): """Create a runtime executor module given a graph and module. Parameters ---------- @@ -30,19 +30,13 @@ def create(graph_json_str, libmod, ctx, ext_lib=None): The graph to be deployed in json format output by nnvm graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. - libmod : tvm.Module The module of the corresponding function - ctx : TVMContext or list of TVMContext The context to deploy the module. It can be local or remote when there is only one TVMContext. Otherwise, the first context in the list will be used as this purpose. All context should be given for heterogeneous execution. - - ext_lib: tvm.Module - The module contains library functions from external codegen tools. 
- Returns ------- graph_module : GraphModule @@ -57,15 +51,12 @@ def create(graph_json_str, libmod, ctx, ext_lib=None): ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx) if num_rpc_ctx == len(ctx): - if ext_lib: - raise Exception("External library is not supported for remote " - "execution") hmod = rpc_base._ModuleHandle(libmod) fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create") - return GraphModule(fcreate(graph_json_str, hmod, ext_lib, *device_type_id)) + return GraphModule(fcreate(graph_json_str, hmod, *device_type_id)) fcreate = get_global_func("tvm.graph_runtime.create") - return GraphModule(fcreate(graph_json_str, libmod, ext_lib, *device_type_id)) + return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) def get_device_ctx(libmod, ctx): """Parse and validate all the device context(s). diff --git a/python/tvm/module.py b/python/tvm/module.py index bd44937d4541..2790227f32c7 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -144,7 +144,12 @@ def export_library(self, else: fcompile = _cc.create_shared if self.type_key == "c": - kwargs.update({'options': ["-I" + path for path in find_include_path()]}) + options = [] + if "options" in kwargs: + opts = kwargs["options"] + options = opts if isinstance(opts, (list, tuple)) else [opts] + opts = options + ["-I" + path for path in find_include_path()] + kwargs.update({'options': opts}) fcompile(file_name, files, **kwargs) def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): @@ -227,7 +232,7 @@ def system_lib(): return _GetSystemLib() -def load(path, fmt="", ext_lib=""): +def load(path, fmt=""): """Load module from file. Parameters @@ -239,9 +244,6 @@ def load(path, fmt="", ext_lib=""): The format of the file, if not specified it will be inferred from suffix of the file. 
- ext_lib : str, optional - The string to indicate the name of the external codegen - Returns ------- module : Module @@ -264,7 +266,7 @@ def load(path, fmt="", ext_lib=""): _cc.create_shared(path + ".so", files) path += ".so" # Redirect to the load API - return _LoadFromFile(path, fmt, ext_lib) + return _LoadFromFile(path, fmt) def enabled(target): diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc index adbe7eaed451..e23ce60223f2 100644 --- a/src/codegen/source_module.cc +++ b/src/codegen/source_module.cc @@ -185,5 +185,8 @@ runtime::Module DeviceSourceModuleCreate( TVM_REGISTER_GLOBAL("module.source_module_create") .set_body_typed(SourceModuleCreate); + +TVM_REGISTER_GLOBAL("module.csource_module_create") +.set_body_typed(CSourceModuleCreate); } // namespace codegen } // namespace tvm diff --git a/src/runtime/contrib/external_util.h b/src/runtime/contrib/external_util.h deleted file mode 100644 index cff3a1e6b903..000000000000 --- a/src/runtime/contrib/external_util.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file src/runtime/contrib/extern_util.h - * \brief The definition of utility function for the external runtime. 
- */ - -#ifndef TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ -#define TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ - -#include -#include - -namespace tvm { -namespace runtime { -namespace contrib { - -/*! - * \brief Split the encoded function name to tokens. - * - * \param the function name string. - * - * \return a vector of tokenized function name splitted by "_". - */ -static inline std::string GetSubgraphID(const std::string& name) { - std::string temp = name; - std::vector tokens; - std::string delimiter = "_"; - size_t pos = 0; - std::string token; - while ((pos = temp.find(delimiter)) != std::string::npos) { - token = temp.substr(0, pos); - tokens.push_back(token); - temp.erase(0, pos + delimiter.length()); - } - tokens.push_back(temp); - - CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; - CHECK(tokens[0] == "subgraph") - << "Function name does not start with \"subgraph\": " << name; - return tokens[1]; -} - -} // namespace contrib -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CONTRIB_EXTERNAL_UTIL_H_ diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc deleted file mode 100644 index e2008e2b850d..000000000000 --- a/src/runtime/contrib/gcc/gcc.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#include "gcc.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace tvm { -namespace runtime { -namespace contrib { - -runtime::PackedFunc GccModuleNode::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - if (name == "init") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - this->Init(args[0]); - }); - } else { - std::string curr_id = GetSubgraphID(name); - - CHECK(IsLoaded()) << "The external module has not been built or failed to open.\n"; - // Generate an external packed function - return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { - CHECK_GT(args.size(), 0U) << "No input is provided."; - - NDArray input0 = args[0]; - const DLTensor* dptr = input0.operator->(); - CHECK(dptr) << "Expect a NDArray as the input."; - runtime::NDArray out_arg = args[args.size() - 1]; - auto out = reinterpret_cast(out_arg->data); - - // Get function from the library - std::string encoded_name = "gcc_" + curr_id; - auto func_s = reinterpret_cast(this->GetSymbol(encoded_name.c_str())); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - GccPackedArgs packed_args; - packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); - for (int i = 0; i < args.size() - 1; ++i) { - runtime::NDArray arg = args[i]; - packed_args.data[i] = reinterpret_cast(arg->data); - } - (*func_s)(packed_args, out); - } else { - LOG(FATAL) << "Only float32 values are supported."; - } - *rv = out; - }); - } -} - -TVM_REGISTER_GLOBAL("module.loadfile_gcc_so") -.set_body([](TVMArgs args, TVMRetValue* rv) { - std::shared_ptr n = std::make_shared(); - n->Init(args[0]); - *rv = runtime::Module(n); - }); - -} // namespace contrib -} // namespace runtime -} // namespace tvm diff --git 
a/src/runtime/contrib/gcc/gcc.h b/src/runtime/contrib/gcc/gcc.h deleted file mode 100644 index 7f05b0d415d7..000000000000 --- a/src/runtime/contrib/gcc/gcc.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_RUNTIME_CONTRIB_GCC_GCC_H_ -#define TVM_RUNTIME_CONTRIB_GCC_GCC_H_ - -#include -#include -#include -#include - -#include "../external_util.h" -#include "../../dso_module.h" - -namespace tvm { -namespace runtime { -namespace contrib { - -constexpr const char* kGccPrefix = "gcc_"; - -/*! - * \brief Defined a data structure to save subgraph args. 
- */ -typedef struct { - float** data; -} GccPackedArgs; - -typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); - -class GccModuleNode : public DSOModuleNode { - public: - const char* type_key() const final { - return "GccModule"; - } - - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) final; -}; - -} // namespace contrib -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CONTRIB_GCC_GCC_H_ diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc index 4f2eb7515b2c..4e189573ffa5 100644 --- a/src/runtime/dso_module.cc +++ b/src/runtime/dso_module.cc @@ -21,41 +21,31 @@ * \file dso_module.cc * \brief Module to load from dynamic shared library. */ -#include "dso_module.h" - #include #include #include #include -#include -#include #include "module_util.h" +#if defined(_WIN32) +#include +#else +#include +#endif + namespace tvm { namespace runtime { -PackedFunc DSOModuleNode::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - BackendPackedCFunc faddr; - if (name == runtime::symbol::tvm_module_main) { - const char* entry_name = reinterpret_cast( - GetSymbol(runtime::symbol::tvm_module_main)); - CHECK(entry_name!= nullptr) - << "Symbol " << runtime::symbol::tvm_module_main << " is not presented"; - faddr = reinterpret_cast(GetSymbol(entry_name)); - } else { - faddr = reinterpret_cast(GetSymbol(name.c_str())); +// Module to load from dynamic shared libary. 
+// This is the default module TVM used for host-side AOT +class DSOModuleNode final : public ModuleNode { + public: + ~DSOModuleNode() { + if (lib_handle_) Unload(); } - if (faddr == nullptr) return PackedFunc(); - return WrapPackedFunc(faddr, sptr_to_self); -} -void DSOModuleNode::Init(const std::string& name) { - Load(name); - if (auto *ctx_addr = - reinterpret_cast(GetSymbol(runtime::symbol::tvm_module_ctx))) { - *ctx_addr = this; + const char* type_key() const final { + return "dso"; } PackedFunc GetFunction( @@ -74,7 +64,63 @@ void DSOModuleNode::Init(const std::string& name) { if (faddr == nullptr) return PackedFunc(); return WrapPackedFunc(faddr, sptr_to_self); } -} + + void Init(const std::string& name) { + Load(name); + if (auto *ctx_addr = + reinterpret_cast(GetSymbol(runtime::symbol::tvm_module_ctx))) { + *ctx_addr = this; + } + InitContextFunctions([this](const char* fname) { + return GetSymbol(fname); + }); + // Load the imported modules + const char* dev_mblob = + reinterpret_cast( + GetSymbol(runtime::symbol::tvm_dev_mblob)); + if (dev_mblob != nullptr) { + ImportModuleBlob(dev_mblob, &imports_); + } + } + + private: + // Platform dependent handling. +#if defined(_WIN32) + // library handle + HMODULE lib_handle_{nullptr}; + // Load the library + void Load(const std::string& name) { + // use wstring version that is needed by LLVM. 
+ std::wstring wname(name.begin(), name.end()); + lib_handle_ = LoadLibraryW(wname.c_str()); + CHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name; + } + void* GetSymbol(const char* name) { + return reinterpret_cast( + GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) + } + void Unload() { + FreeLibrary(lib_handle_); + } +#else + // Library handle + void* lib_handle_{nullptr}; + // load the library + void Load(const std::string& name) { + lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + CHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name + << " " << dlerror(); + } + void* GetSymbol(const char* name) { + return dlsym(lib_handle_, name); + } + void Unload() { + dlclose(lib_handle_); + } +#endif +}; TVM_REGISTER_GLOBAL("module.loadfile_so") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/dso_module.h b/src/runtime/dso_module.h deleted file mode 100644 index 2ca084fdc487..000000000000 --- a/src/runtime/dso_module.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file src/runtime/dso_module.h - * \brief Module to load from dynamic shared library. 
- */ -#ifndef TVM_RUNTIME_DSO_MODULE_H_ -#define TVM_RUNTIME_DSO_MODULE_H_ - -#include -#include -#include -#include - -#if defined(_WIN32) -#include -#else -#include -#endif - -namespace tvm { -namespace runtime { - -// Module to load from dynamic shared libary. -// This is the default module TVM used for host-side AOT -class DSOModuleNode : public ModuleNode { - public: - ~DSOModuleNode() { - if (lib_handle_) Unload(); - } - - virtual const char* type_key() const { - return "dso"; - } - - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. This function can be implemented by - * different backends as well to implement their own way of retrieving - * a function poninter and invoking it. - * - * \param name the name of the external function. - * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. - */ - PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override; - - /*! - * \brief Initialize the module using a prvided shared library. - * \param name. The dynamically linked shared library. - */ - void Init(const std::string& name); - - protected: - // Platform dependent handling. -#if defined(_WIN32) - // library handle - HMODULE lib_handle_{nullptr}; - // Load the library - void Load(const std::string& name) { - // use wstring version that is needed by LLVM. - std::wstring wname(name.begin(), name.end()); - lib_handle_ = LoadLibraryW(wname.c_str()); - CHECK(lib_handle_ != nullptr) - << "Failed to load dynamic shared library " << name; - } - void* GetSymbol(const char* name) { - return reinterpret_cast( - GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) - } - void Unload() { - FreeLibrary(lib_handle_); - } - // Check if the handle_ is open. 
- bool IsLoaded() const { - rewturn lib_handle_ != nullptr; - } -#else - // Library handle - void* lib_handle_{nullptr}; - // load the library - void Load(const std::string& name) { - lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(lib_handle_ != nullptr) - << "Failed to load dynamic shared library " << name - << " " << dlerror(); - } - void* GetSymbol(const char* name) { - return dlsym(lib_handle_, name); - } - void Unload() { - dlclose(lib_handle_); - } - // Check if the handle_ is open. - bool IsLoaded() const { - return lib_handle_ != nullptr; - } -#endif -}; - -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_DSO_MODULE_H_ diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 6bce89daec07..ab28cb662f2a 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -202,12 +202,10 @@ PackedFunc GraphRuntimeDebug::GetFunction( * \brief GraphRuntimeDebugCreate Get the function based on input. * \param sym_json The graph symbol in json format. * \param m Compiled module which will be loaded. - * \param ext_m Compiled module that contains code from external library. * \param ctxs All devices contexts. 
*/ Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const tvm::runtime::Module& ext_m, const std::vector& ctxs) { auto exec = make_object(); exec->Init(sym_json, m, ctxs); @@ -216,22 +214,18 @@ Module GraphRuntimeDebugCreate(const std::string& sym_json, TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 5) - << "The expected number of arguments for graph_runtime.create is " - "at least 5, but it has " - << args.num_args; - if (args[2].type_code() == kModuleHandle) { - *rv = GraphRuntimeDebugCreate(args[0], args[1], args[2], GetAllContext(args)); - } else { - *rv = GraphRuntimeDebugCreate(args[0], args[1], Module(nullptr), GetAllContext(args)); - } -}); + CHECK_GE(args.num_args, 4) + << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; + *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args)); + }); TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.remote_create") .set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_GE(args.num_args, 4) << "The expected number of arguments for " "graph_runtime.remote_create is " - "at least 5, but it has " + "at least 4, but it has " << args.num_args; void* mhandle = args[1]; ModuleNode* mnode = ObjectInternal::GetModuleNode(mhandle); diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 7331250e44da..9ad10c1232c3 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -67,7 +67,6 @@ void GraphRuntime::Run() { */ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - tvm::runtime::Module ext_module, const std::vector& ctxs) { #ifndef _LIBCPP_SGX_NO_IOSTREAMS std::istringstream is(graph_json); @@ -77,7 +76,6 @@ void GraphRuntime::Init(const std::string& graph_json, dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; - ext_module_ 
= ext_module; ctxs_ = ctxs; this->SetupStorage(); this->SetupOpExecs(); @@ -334,47 +332,27 @@ void GraphRuntime::SetupOpExecs() { const auto& inode = nodes_[nid]; if (inode.op_type == "null") continue; std::vector args; - std::vector arity; for (const auto& e : inode.inputs) { uint32_t eid = this->entry_id(e); - arity.push_back(eid); args.push_back(*(data_entry_[eid].operator->())); } for (uint32_t index = 0; index < inode.param.num_outputs; ++index) { uint32_t eid = this->entry_id(nid, index); - arity.push_back(eid); args.push_back(*(data_entry_[eid].operator->())); } - CHECK(inode.op_type == "tvm_op" || inode.op_type == "external_op") - << "Can only take tvm_op or external_op as op"; + CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; - if (inode.op_type == "tvm_op") { - std::shared_ptr op_args = nullptr; - std::tie(op_execs_[nid], op_args) = + std::shared_ptr op_args = nullptr; + std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args, inode.inputs.size()); - for (size_t i = 0; i < inode.inputs.size(); i++) { - uint32_t eid = this->entry_id(inode.inputs[i]); - // check if op input is model input - if (input_node_eids.count(eid) > 0) { - input_dltensors_[eid].push_back( - static_cast(op_args->arg_values[i].v_handle)); - } + for (size_t i = 0; i < inode.inputs.size(); i++) { + uint32_t eid = this->entry_id(inode.inputs[i]); + // check if op input is model input + if (input_node_eids.count(eid) > 0) { + input_dltensors_[eid].push_back( + static_cast(op_args->arg_values[i].v_handle)); } - } else if (inode.op_type == "external_op") { - tvm::runtime::PackedFunc pf = ext_module_.GetFunction(inode.param.func_name, false); - CHECK(pf != nullptr) << "no such function in module: " << inode.param.func_name; - auto fexec = [pf, arity, this]() { - std::vector values(arity.size()); - std::vector codes(arity.size()); - runtime::TVMArgsSetter setter(values.data(), codes.data()); - for (size_t i = 0; i < arity.size(); i++) { - setter(i, 
this->data_entry_[arity[i]]); - } - TVMRetValue rv; - pf.CallPacked(TVMArgs(values.data(), codes.data(), arity.size()), &rv); - }; - op_execs_[nid] = fexec; } } } @@ -499,7 +477,6 @@ PackedFunc GraphRuntime::GetFunction( Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const tvm::runtime::Module& ext_m, const std::vector& ctxs) { auto exec = make_object(); exec->Init(sym_json, m, ctxs); @@ -511,7 +488,7 @@ std::vector GetAllContext(const TVMArgs& args) { // Reserve the first item as the fallback device. std::vector ret; TVMContext ctx; - for (int i = 3; i < args.num_args; i += 2) { + for (int i = 2; i < args.num_args; i += 2) { int dev_type = args[i]; ctx.device_type = static_cast(dev_type); ctx.device_id = args[i + 1]; @@ -527,30 +504,24 @@ std::vector GetAllContext(const TVMArgs& args) { // Eventually, we will only probably pass TVMContext for all the languages. TVM_REGISTER_GLOBAL("tvm.graph_runtime.create") .set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 5) + CHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " - "at least 5, but it has " + "at least 4, but it has " << args.num_args; const auto& contexts = GetAllContext(args); - if (args[2].type_code() == kModuleHandle) { - *rv = GraphRuntimeCreate(args[0], args[1], args[2], contexts); - } else { - *rv = GraphRuntimeCreate(args[0], args[1], Module(nullptr), contexts); - } + *rv = GraphRuntimeCreate(args[0], args[1], contexts); }); TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create") .set_body([](TVMArgs args, TVMRetValue* rv) { CHECK_GE(args.num_args, 4) << "The expected number of arguments for " "graph_runtime.remote_create is " - "at least 5, but it has " + "at least 4, but it has " << args.num_args; void* mhandle = args[1]; ModuleNode* mnode = ObjectInternal::GetModuleNode(mhandle); const auto& contexts = GetAllContext(args); - // TODO(zhiics) RPC is not supported for external library. 
- CHECK_NE(args[2].type_code(), kModuleHandle) << "External library is not supported by RPC"; *rv = GraphRuntimeCreate( args[0], GetRef(mnode), contexts); }); diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index cded19f15245..c83d68e08159 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -96,15 +96,12 @@ class GraphRuntime : public ModuleNode { * \param graph_json The execution graph. * \param module The module containing the compiled functions for the host * processor. - * \param ext_module The module containing the compiled functions using - * external codegen tools. * \param ctxs The context of the host and devices where graph nodes will be * executed on. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - tvm::runtime::Module ext_module, const std::vector& ctxs); /*! @@ -411,11 +408,6 @@ class GraphRuntime : public ModuleNode { GraphAttr attrs_; /*! \brief The code module that contains both host and device code. */ tvm::runtime::Module module_; - /*! - * \brief The code module that contains external library. - * TODO(zhiics) Support multiple external modules. - */ - tvm::runtime::Module ext_module_; /*! \brief Execution context of all devices including the host. */ std::vector ctxs_; /*! \brief Common storage pool for all devices. 
*/ diff --git a/src/runtime/module.cc b/src/runtime/module.cc index b1d4c5046bd8..161675c7ca0c 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -76,8 +76,7 @@ PackedFunc ModuleNode::GetFunction(const std::string& name, bool query_imports) } Module Module::LoadFromFile(const std::string& file_name, - const std::string& format, - const std::string& external_lib_name) { + const std::string& format) { #ifndef _LIBCPP_SGX_CONFIG std::string fmt = GetFileFormat(file_name, format); CHECK(fmt.length() != 0) @@ -85,11 +84,7 @@ Module Module::LoadFromFile(const std::string& file_name, if (fmt == "dll" || fmt == "dylib" || fmt == "dso") { fmt = "so"; } - std::string load_f_name = "module.loadfile_"; - if (!external_lib_name.empty()) { - load_f_name = load_f_name + external_lib_name + "_"; - } - load_f_name += fmt; + std::string load_f_name = "module.loadfile_" + fmt; const PackedFunc* f = Registry::Get(load_f_name); CHECK(f != nullptr) << "Loader of " << format << "(" @@ -201,7 +196,7 @@ TVM_REGISTER_GLOBAL("module._GetTypeKey") TVM_REGISTER_GLOBAL("module._LoadFromFile") .set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Module::LoadFromFile(args[0], args[1], args[2]); + *ret = Module::LoadFromFile(args[0], args[1]); }); TVM_REGISTER_GLOBAL("module._SaveToFile") diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 42e9ec85271b..b820fbeafb61 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -15,81 +15,101 @@ # specific language governing permissions and limitations # under the License. 
"""Unit tests for external runtime.""" -import os from shutil import which -import numpy as np import json +import numpy as np import tvm from tvm import relay +from tvm import module as _tvm_module -def generate_multinode_binary(): - '''Generate a binary''' +def generate_csource_module(): + """Generate a binary""" code = r''' - # include - # include - # include - - typedef struct { - float** data; - } GccPackedArgs; + #include + #include + #include + #include + #include - # define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ + #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ extern "C" void p_ID_(float* a, float* b, float* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ out[i] = a[i] p_OP_ b[i]; \ } \ } - # define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ + #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ } GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10); GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10); GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10); - extern "C" void gcc_1(GccPackedArgs args, float* out) { - float* gcc_input4 = args.data[0]; - float* gcc_input5 = args.data[1]; - float* gcc_input6 = args.data[2]; - float* gcc_input7 = args.data[3]; + + extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, + float* gcc_input6, float* gcc_input7, float* out) { float* buf_0 = (float*)malloc(4 * 100); float* buf_1 = (float*)malloc(4 * 100); - float* buf_2 = (float*)malloc(4 * 100); gcc_1_2(gcc_input4, gcc_input5, buf_0); gcc_1_1(buf_0, gcc_input6, buf_1); - gcc_1_0(buf_1, gcc_input7, buf_2); - memcpy(out, buf_2, 4 *100); + 
gcc_1_0(buf_1, gcc_input7, out); + } + + extern "C" int gcc_1(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_1_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; } + GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10); GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10); GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10); - extern "C" void gcc_0(GccPackedArgs args, float* out) { - float* gcc_input0 = args.data[0]; - float* gcc_input1 = args.data[1]; - float* gcc_input2 = args.data[2]; - float* gcc_input3 = args.data[3]; + + extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, + float* gcc_input2, float* gcc_input3, float* out) { float* buf_0 = (float*)malloc(4 * 100); float* buf_1 = (float*)malloc(4 * 100); - float* buf_2 = (float*)malloc(4 * 100); gcc_0_2(gcc_input0, gcc_input1, buf_0); gcc_0_1(buf_0, gcc_input2, buf_1); - gcc_0_0(buf_1, gcc_input3, buf_2); - memcpy(out, buf_2, 4 *100); + gcc_0_0(buf_1, gcc_input3, out); + } + + extern "C" int gcc_0(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_0_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; } ''' - code = "echo \'" + code + "\'" - cmd = "g++ -std=c++11 -shared -fPIC -ldl -o external_test.so 
-xc++ -" - cmd = code + " | " + cmd - if os.system(cmd) != 0: - raise RuntimeError("Compilation for external_test.so failed") + csource_module = _tvm_module.csource_module_create(code, "cc") + return csource_module def get_synthetic_lib(): @@ -121,9 +141,6 @@ def get_synthetic_lib(): gcc_input5 = relay.var('gcc_input5', shape=(10, 10)) gcc_input6 = relay.var('gcc_input6', shape=(10, 10)) gcc_input7 = relay.var('gcc_input7', shape=(10, 10)) - add1 = relay.add(gcc_input4, gcc_input5) - sub1 = relay.subtract(add1, gcc_input6) - mul1 = relay.multiply(sub1, gcc_input7) subgraph1 = relay.Function([gcc_input4, gcc_input5, gcc_input6, gcc_input7], relay.copy(gcc_input4)) subgraph1 = subgraph1.set_attribute( @@ -154,12 +171,12 @@ def get_json(): node7 = {"op": "null", "name": "w7", "inputs": []} subgraph0 = { - "op": "external_op", - "name": "subgraph_0", + "op": "tvm_op", + "name": "gcc_0", "attrs": { "num_outputs": "1", "num_inputs": "4", - "func_name": "subgraph_0", + "func_name": "gcc_0", "flatten_data": "0" }, "inputs": [ @@ -170,12 +187,12 @@ def get_json(): ] } subgraph1 = { - "op": "external_op", - "name": "subgraph_1", + "op": "tvm_op", + "name": "gcc_1", "attrs": { "num_outputs": "1", "num_inputs": "4", - "func_name": "subgraph_1", + "func_name": "gcc_1", "flatten_data": "0" }, "inputs": [ @@ -233,18 +250,24 @@ def get_json(): return json.dumps(graph) -def test_simulated_runtime(): +def test_extern_dso_runtime(): if which("gcc") is None: print("Skip test because gcc is not available.") - # library that contains external code. - generate_multinode_binary() - + # Get Json and the compiled library. json = get_json() lib = get_synthetic_lib() - ext_lib = tvm.module.load("external_test.so", ext_lib="gcc") + cur_lib = lib.save("lib.o") - mod = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0), ext_lib) + # library that contains external code. 
+ csource_module = generate_csource_module() + # csource_module.save("external.cc", "cc") + kwargs = {"options": ["lib.o", "-O2", "-std=c++11"]} + # csource_module.save("external.cc") + csource_module.export_library("external.so", fcompile=False, **kwargs) + # load module for execution. + lib = tvm.module.load("external.so") + mod = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0)) x_data = np.random.rand(10, 10).astype('float32') mod.set_input("x", x_data) @@ -267,4 +290,4 @@ def test_simulated_runtime(): if __name__ == "__main__": - test_simulated_runtime() + test_extern_dso_runtime() From 1c2baf91aade39b36bb3162dbb555772816b117c Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 12 Nov 2019 06:46:25 +0000 Subject: [PATCH 04/22] retrigger ci From 3647def8dd9655a4297751993dc1a344894ce1dc Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 13 Nov 2019 12:03:42 -0800 Subject: [PATCH 05/22] add an example with runtime engine --- tests/python/relay/test_external_runtime.py | 215 ++++++++++++++++++-- 1 file changed, 199 insertions(+), 16 deletions(-) diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index b820fbeafb61..c5257c16eb36 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -25,7 +25,7 @@ def generate_csource_module(): - """Generate a binary""" + """Mock the codegen with an external library (e.g., CBLAS/cuDNN)""" code = r''' #include @@ -111,6 +111,187 @@ def generate_csource_module(): csource_module = _tvm_module.csource_module_create(code, "cc") return csource_module +def generate_engine_module(): + """ + Mock the codegen of an external backend with its own runtime engine + (e.g., MKL-DNN/TensorRT) + """ + + code = r''' + #include + #include + #include "gcc_engine.h" + + extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, + float* gcc_input6, float* gcc_input7, float* out) { + + std::string graph = + "add_2d,10,10\n" + 
"sub_2d,10,10\n" + "mul_2d,10,10\n"; + + Engine engine; + engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out); + } + + + extern "C" int gcc_1(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_1_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + + extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, + float* gcc_input2, float* gcc_input3, float* out) { + + std::string graph = + "add_2d,10,10\n" + "sub_2d,10,10\n" + "mul_2d,10,10\n"; + + Engine engine; + engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out); + + } + + extern "C" int gcc_0(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_0_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + ''' + + gen_gcc_engine() + csource_module = _tvm_module.csource_module_create(code, "cc") + return csource_module + +def gen_gcc_engine(): + """An example of external backend runtime engine. This is supposed to be provided + by third-party vendors and included when building the generated external kernel code. 
+ """ + + code = r''' + #ifndef _GCC_ENGINE_H_ + #define _GCC_ENGINE_H_ + #include + #include + #include + #include + + #define GCC_BINARY_OP_2D(p_ID_, p_OP_) \ + void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) { \ + for (int64_t i = 0; i < dim1; ++i) { \ + for (int64_t j = 0; j < dim2; ++j) { \ + int64_t k = i * dim2 + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ + } + GCC_BINARY_OP_2D(add_2d, +); + GCC_BINARY_OP_2D(sub_2d, -); + GCC_BINARY_OP_2D(mul_2d, *); + + struct Layer { + void (*op)(int64_t, int64_t, float*, float*, float*); + std::vector shapes; + std::vector args; + }; + + class Engine { + public: + float* alloc_buffer(int64_t size) { + float* buf = (float*)malloc(sizeof(float) * size); + buffers.push_back(buf); + return buf; + } + void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) { + Layer layer; + layer.shapes.push_back(dim1); + layer.shapes.push_back(dim2); + layer.args.push_back(in1); + layer.args.push_back(in2); + layer.args.push_back(out); + + if (op == "add_2d") + layer.op = &add_2d; + else if (op == "sub_2d") + layer.op = &sub_2d; + else if (op == "mul_2d") + layer.op = &mul_2d; + net.push_back(layer); + return ; + } + + void run(std::string graph, std::vector args, float* out) { + std::stringstream ss(graph); + std::string line; + int layer_idx = 0; + int arg_idx = 0; + float* buf = nullptr; + + while (std::getline(ss, line, '\n')) { + std::stringstream ss2(line); + std::string token; + std::vector attrs; + while (std::getline(ss2, token, ',')) { + attrs.push_back(token); + } + int64_t dim1 = stoll(attrs[1]); + int64_t dim2 = stoll(attrs[2]); + auto out_buf = this->alloc_buffer(dim1 * dim2); + + if (layer_idx == 0) { + this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf); + buf = out_buf; + arg_idx = 2; + } + else { + this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf); + buf = out_buf; + arg_idx++; + } + layer_idx++; + } + this->net.back().args.back() = out; + + for 
(auto layer : net) { + (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1], layer.args[2]); + } + } + ~Engine() { + for (auto buf : buffers) { + free(buf); + } + } + private: + std::vector net; + std::vector buffers; + }; + + #endif + ''' + with open('gcc_engine.h', 'w') as f: + f.write(code) def get_synthetic_lib(): x = relay.var('x', shape=(10, 10)) @@ -159,7 +340,7 @@ def get_synthetic_lib(): return lib -def get_json(): +def get_whole_graph_json(): nodex = {"op": "null", "name": "x", "inputs": []} node0 = {"op": "null", "name": "w0", "inputs": []} node1 = {"op": "null", "name": "w1", "inputs": []} @@ -250,23 +431,26 @@ def get_json(): return json.dumps(graph) -def test_extern_dso_runtime(): +def test_extern(label, get_extern_src, **kwargs): if which("gcc") is None: print("Skip test because gcc is not available.") + obj_name = "{}.o".format(label) + lib_name = "external_{}.so".format(label) + # Get Json and the compiled library. - json = get_json() + json = get_whole_graph_json() lib = get_synthetic_lib() - cur_lib = lib.save("lib.o") + lib.save(obj_name) # library that contains external code. - csource_module = generate_csource_module() + csource_module = get_extern_src() # csource_module.save("external.cc", "cc") - kwargs = {"options": ["lib.o", "-O2", "-std=c++11"]} + kwargs["options"] = [obj_name] + kwargs["options"] # csource_module.save("external.cc") - csource_module.export_library("external.so", fcompile=False, **kwargs) + csource_module.export_library(lib_name, fcompile=False, **kwargs) # load module for execution. 
- lib = tvm.module.load("external.so") + lib = tvm.module.load(lib_name) mod = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0)) x_data = np.random.rand(10, 10).astype('float32') @@ -282,12 +466,11 @@ def test_extern_dso_runtime(): out = mod.get_output(0, out) tvm.testing.assert_allclose( out.asnumpy(), - np.concatenate( - (((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) - + np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) if __name__ == "__main__": - test_extern_dso_runtime() + test_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) + test_extern("engine", generate_engine_module, options=["-O2", "-std=c++11", "-I."]) From a73949f34e0548b469a6551b55ccc84a20cd2af3 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 14 Nov 2019 06:57:31 +0000 Subject: [PATCH 06/22] uses tmp dir --- tests/python/relay/test_external_runtime.py | 36 +++++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index c5257c16eb36..158a9dc50a37 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -22,6 +22,9 @@ import tvm from tvm import relay from tvm import module as _tvm_module +from tvm.contrib import util + +tmp_path = util.tempdir() def generate_csource_module(): @@ -111,6 +114,7 @@ def generate_csource_module(): csource_module = _tvm_module.csource_module_create(code, "cc") return csource_module + def generate_engine_module(): """ Mock the codegen of an external backend with its own runtime engine @@ -185,6 +189,7 @@ def generate_engine_module(): csource_module = _tvm_module.csource_module_create(code, "cc") return csource_module + def gen_gcc_engine(): """An example of external backend 
runtime engine. This is supposed to be provided by third-party vendors and included when building the generated external kernel code. @@ -290,9 +295,11 @@ class Engine { #endif ''' - with open('gcc_engine.h', 'w') as f: + header_file = tmp_path.relpath("gcc_engine.h") + with open(header_file, 'w') as f: f.write(code) + def get_synthetic_lib(): x = relay.var('x', shape=(10, 10)) w0 = relay.var('w0', shape=(10, 10)) @@ -431,7 +438,7 @@ def get_whole_graph_json(): return json.dumps(graph) -def test_extern(label, get_extern_src, **kwargs): +def check_extern(label, get_extern_src, **kwargs): if which("gcc") is None: print("Skip test because gcc is not available.") @@ -439,19 +446,18 @@ def test_extern(label, get_extern_src, **kwargs): lib_name = "external_{}.so".format(label) # Get Json and the compiled library. - json = get_whole_graph_json() + graph_json = get_whole_graph_json() lib = get_synthetic_lib() lib.save(obj_name) # library that contains external code. csource_module = get_extern_src() - # csource_module.save("external.cc", "cc") kwargs["options"] = [obj_name] + kwargs["options"] - # csource_module.save("external.cc") - csource_module.export_library(lib_name, fcompile=False, **kwargs) + lib_path = tmp_path.relpath(lib_name) + csource_module.export_library(lib_path, fcompile=False, **kwargs) # load module for execution. 
- lib = tvm.module.load(lib_name) - mod = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0)) + lib = tvm.module.load(lib_path) + mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) x_data = np.random.rand(10, 10).astype('float32') mod.set_input("x", x_data) @@ -471,6 +477,16 @@ def test_extern(label, get_extern_src, **kwargs): x_data + w_data[6] - w_data[7]), axis=0)) + +def test_dso_extern(): + check_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) + + +def test_engine_extern(): + check_extern("engine", generate_engine_module, + options=["-O2", "-std=c++11", "-I"+tmp_path.relpath("")]) + + if __name__ == "__main__": - test_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) - test_extern("engine", generate_engine_module, options=["-O2", "-std=c++11", "-I."]) + test_dso_extern() + test_engine_extern() From f1d1540b1e1d9aa3c561dca0393dc3e0a9f84bbd Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 14 Nov 2019 23:19:55 +0000 Subject: [PATCH 07/22] free buf --- tests/python/relay/test_external_runtime.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 158a9dc50a37..742f2b617ba3 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -64,6 +64,8 @@ def generate_csource_module(): gcc_1_2(gcc_input4, gcc_input5, buf_0); gcc_1_1(buf_0, gcc_input6, buf_1); gcc_1_0(buf_1, gcc_input7, out); + free(buf_0); + free(buf_1); } extern "C" int gcc_1(TVMValue* value, int* type_code, int nargs) { @@ -93,6 +95,8 @@ def generate_csource_module(): gcc_0_2(gcc_input0, gcc_input1, buf_0); gcc_0_1(buf_0, gcc_input2, buf_1); gcc_0_0(buf_1, gcc_input3, out); + free(buf_0); + free(buf_1); } extern "C" int gcc_0(TVMValue* value, int* type_code, int nargs) { From e753aab689f4849048fba316187af053fdd05831 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Sat, 16 Nov 2019 23:03:29 +0000 
Subject: [PATCH 08/22] cpp test for example json runtime --- tests/cpp/external_runtime_test.cc | 234 +++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 tests/cpp/external_runtime_test.cc diff --git a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc new file mode 100644 index 000000000000..f63b61e03882 --- /dev/null +++ b/tests/cpp/external_runtime_test.cc @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file external_runtime_test.cc + * \brief Test an example runtime module to interpreting a json string. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using tvm::runtime::Module; +using tvm::runtime::ModuleNode; +using tvm::runtime::NDArray; +using tvm::runtime::Object; +using tvm::runtime::ObjectPtr; +using tvm::runtime::PackedFunc; +using tvm::runtime::TVMArgsSetter; +using tvm::runtime::TVMArgs; +using tvm::runtime::TVMRetValue; + +void Add_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] + b[i]; + } +} + +int Add(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Add_(static_cast(arg0->data), arg0->shape[0], + static_cast(arg1->data), arg1->shape[0], + static_cast(out->data)); + return 0; +} + +void Sub_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] - b[i]; + } +} + +int Sub(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Sub_(static_cast(arg0->data), arg0->shape[0], + static_cast(arg1->data), arg1->shape[0], + static_cast(out->data)); + return 0; +} + +class ExampleJSonModule : public ModuleNode { + public: + PackedFunc GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) final { + if (name == "example_json_rt") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + NDArray arg0 = args[0]; + NDArray arg1 = args[1]; + NDArray arg2 = args[2]; + this->data_entry_[0].CopyFrom(arg0); + this->data_entry_[1].CopyFrom(arg1); + this->data_entry_[2].CopyFrom(arg2); 
+ for (const auto& it : this->graph_) { + this->run(it.first, it.second); + } + *rv = data_entry_.back(); + }); + } else { + LOG(FATAL) << "Unkown runtime type: " << name << "\n"; + return PackedFunc(); + } + } + + void run(int id, const std::vector& inputs) { + std::vector values(inputs.size()); + std::vector type_codes(inputs.size()); + TVMArgsSetter setter(values.data(), type_codes.data()); + + if (op_id_[id] == "add" || op_id_[id] == "sub") { + for (size_t i = 0; i < inputs.size(); i++) { + setter(i, data_entry_[inputs[i]]); + } + } + + if (op_id_[id] == "add") { + Add(values.data(), type_codes.data(), inputs.size()); + } else if (op_id_[id] == "sub") { + Sub(values.data(), type_codes.data(), inputs.size()); + } + } + + const char* type_key() const { return "examplejson"; } + + void SaveToBinary(dmlc::Stream* stream) final { + // Write to a json string. + } + + // Note this is a very simple json that only serves for demostration purpose. + // Users usually have their own format and they can serialize it using the + // SaveToBinary method and deserialize it using LoadFromFile. 
+ void ParseJson(const std::string& json) { + std::string line; + std::stringstream ss(json); + + while (std::getline(ss, line, '\n')) { + std::stringstream ss2(line); + std::string token; + int id = 0; + + ss2 >> token; + ss2 >> id; + if (op_id_.size() <= static_cast(id)) { + op_id_.resize(id + 1); + data_entry_.resize(id + 1); + } + + int64_t total_elements = 1; + std::vector shape; + if (token == "input") { + int64_t size = 0; + while (ss2 >> size) { + total_elements *= size; + shape.push_back(size); + } + } else { + op_id_[id] = token; + bool shape_data = false; + while (ss2 >> token) { + if (token == "shape:") { + shape_data = true; + } else if (shape_data) { + total_elements *= std::stoll(token); + shape.push_back(std::stoll(token)); + } else if (token != "inputs:") { + graph_[id].push_back(std::stoi(token)); + } + } + graph_[id].push_back(id); + } + DLContext ctx; + ctx.device_type = static_cast(1); + ctx.device_id = 0; + data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); + } + } + + static Module LoadFromFile(const std::string& json, const std::string& format) { + auto n = tvm::runtime::make_object(); + n->ParseJson(json); + return Module(n); + } + + private: + // op -> inputs + std::map > graph_; + std::vector data_entry_; + // id -> op + std::vector op_id_; +}; + +TEST(ExampleModule, Basic) { + // This is a simple json format used for testing. Users/vendors can define + // their own format. 
+ std::string json = + "input 0 10 10\n" + "input 1 10 10\n" + "input 2 10 10\n" + "add 3 inputs: 0 1 shape: 10 10\n" + "sub 4 inputs: 3 2 shape: 10 10"; + + Module mod = ExampleJSonModule::LoadFromFile(json, ""); + PackedFunc f = mod.GetFunction("example_json_rt", false); + + auto a_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto b_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto c_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + + float* pa = (float*)a_val.ToDLPack()->dl_tensor.data; + float* pb = (float*)b_val.ToDLPack()->dl_tensor.data; + float* pc = (float*)c_val.ToDLPack()->dl_tensor.data; + + // Assign values. + for (int i = 0; i < 10 * 10; i++) { + pa[i] = i; + pb[i] = i + 1.0; + pc[i] = i + 2.0; + } + + NDArray out = f(a_val, b_val, c_val); + float* p_out = (float*)out.ToDLPack()->dl_tensor.data; + + // Check correctness of result + for (int i = 0; i < 10; i++) { + CHECK_LT(std::fabs(p_out[i] - ((i + (i + 1.0) - (i + 2.0)))), 1e-5); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} From d17ecc1f2a2200c0ed08a315dca70897c0435e37 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sat, 16 Nov 2019 17:12:21 -0800 Subject: [PATCH 09/22] add constructor and desctor --- tests/cpp/external_runtime_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc index f63b61e03882..043892a999a7 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/tests/cpp/external_runtime_test.cc @@ -81,6 +81,9 @@ int Sub(TVMValue* value, int* type_code, int nargs) { class ExampleJSonModule : public ModuleNode { public: + ExampleJSonModule() {} + ~ExampleJSonModule() {} + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { if (name == "example_json_rt") { From f01c466f85b36bf38ca37c96e4bff01feaeb2d18 Mon 
Sep 17 00:00:00 2001 From: Zhi Chen Date: Sun, 17 Nov 2019 05:05:50 +0000 Subject: [PATCH 10/22] fix ci --- tests/cpp/external_runtime_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc index 043892a999a7..64228dc3d16f 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/tests/cpp/external_runtime_test.cc @@ -185,6 +185,9 @@ class ExampleJSonModule : public ModuleNode { return Module(n); } + void SaveToFile(const std::string& file_name, const std::string& format) final {} + std::string GetSource(const std::string& format = "") final { return ""; } + private: // op -> inputs std::map > graph_; From a530f92774ecc46447273670786de8e1515427b6 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sat, 16 Nov 2019 21:58:13 -0800 Subject: [PATCH 11/22] move JSON runtime to src --- CMakeLists.txt | 1 + cmake/modules/contrib/Extern.cmake | 25 +++ include/tvm/runtime/contrib/gcc.h | 76 +++++++++ src/runtime/contrib/gcc/gcc.cc | 170 ++++++++++++++++++++ tests/cpp/external_runtime_test.cc | 154 +----------------- tests/python/relay/test_external_runtime.py | 49 ++++++ 6 files changed, 326 insertions(+), 149 deletions(-) create mode 100644 cmake/modules/contrib/Extern.cmake create mode 100644 include/tvm/runtime/contrib/gcc.h create mode 100644 src/runtime/contrib/gcc/gcc.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bea818b7581..e5d7c7c1ede1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,7 @@ include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/Random.cmake) +include(cmake/modules/contrib/Extern.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake new file mode 100644 index 000000000000..498ad3c9819c --- 
/dev/null +++ b/cmake/modules/contrib/Extern.cmake @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +message(STATUS "Build with relay.backend.contrib") + +list(FIND USE_EXTERN "gcc" _gcc_idx) +if(_gcc_idx GREATER -1) + file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) + list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) + message(STATUS "Use extern runtime: GCC") +endif() diff --git a/include/tvm/runtime/contrib/gcc.h b/include/tvm/runtime/contrib/gcc.h new file mode 100644 index 000000000000..d4df80377e8f --- /dev/null +++ b/include/tvm/runtime/contrib/gcc.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file external_runtime_test.cc + * \brief Test an example runtime module to interpreting a json string. + */ +#ifndef TVM_RUNTIME_CONTRIB_GCC_H_ +#define TVM_RUNTIME_CONTRIB_GCC_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { + +class ExampleJSonModule : public ModuleNode { + public: + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final; + + void run(int id, const std::vector& inputs); + + const char* type_key() const { return "examplejson"; } + + void SaveToBinary(dmlc::Stream* stream) final { + // Write to a json string. + } + + // Note this is a very simple json that only serves for demostration purpose. + // Users usually have their own format and they can serialize it using the + // SaveToBinary method and deserialize it using LoadFromFile. 
+ void ParseJson(const std::string& json); + + static Module LoadFromFile(const std::string& json, const std::string& format) { + auto n = tvm::runtime::make_object(); + n->ParseJson(json); + return Module(n); + } + + private: + std::string curr_subgraph_; + // op -> inputs + std::map >> graph_; + std::vector data_entry_; + // id -> op + std::vector op_id_; +}; +#endif + +} +} \ No newline at end of file diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc new file mode 100644 index 000000000000..8a4860dc1e0c --- /dev/null +++ b/src/runtime/contrib/gcc/gcc.cc @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file external_runtime_test.cc + * \brief Test an example runtime module to interpreting a json string. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { + +void Add_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] + b[i]; + } +} + +int Add(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Add_(static_cast(arg0->data), arg0->shape[0], static_cast(arg1->data), + arg1->shape[0], static_cast(out->data)); + return 0; +} + +void Sub_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] - b[i]; + } +} + +int Sub(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Sub_(static_cast(arg0->data), arg0->shape[0], static_cast(arg1->data), + arg1->shape[0], static_cast(out->data)); + return 0; +} + +PackedFunc ExampleJSonModule::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (this->graph_.find(name) != this->graph_.end()) { + this->curr_subgraph_ = name; + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + for (uint32_t i = 0; i < args.size(); ++i) { + NDArray arg = args[i]; + this->data_entry_[i].CopyFrom(arg); + } + for (const auto& it : this->graph_[this->curr_subgraph_]) { + this->run(it.first, it.second); + } + *rv = data_entry_.back(); + }); + } + else { + LOG(FATAL) << "Unkown runtime type: " << name << "\n"; + return PackedFunc(); + } +} + +void ExampleJSonModule::run(int id, const std::vector& inputs) { + std::vector values(inputs.size()); + std::vector 
type_codes(inputs.size()); + TVMArgsSetter setter(values.data(), type_codes.data()); + + if (op_id_[id] == "add" || op_id_[id] == "sub") { + for (size_t i = 0; i < inputs.size(); i++) { + setter(i, data_entry_[inputs[i]]); + } + } + + if (op_id_[id] == "add") { + Add(values.data(), type_codes.data(), inputs.size()); + } else if (op_id_[id] == "sub") { + Sub(values.data(), type_codes.data(), inputs.size()); + } +} + +// Note this is a very simple json that only serves for demostration purpose. +// Users usually have their own format and they can serialize it using the +// SaveToBinary method and deserialize it using LoadFromFile. +void ExampleJSonModule::ParseJson(const std::string& json) { + std::string line; + std::string curr_subgraph; + std::stringstream ss(json); + + while (std::getline(ss, line, '\n')) { + std::stringstream ss2(line); + std::string token; + int id = 0; + + ss2 >> token; + if (token.find("gcc_") != std::string::npos) { + curr_subgraph = token; + graph_[curr_subgraph]; + continue; + } + + ss2 >> id; + if (op_id_.size() <= static_cast(id)) { + op_id_.resize(id + 1); + data_entry_.resize(id + 1); + } + + int64_t total_elements = 1; + std::vector shape; + if (token == "input") { + int64_t size = 0; + while (ss2 >> size) { + total_elements *= size; + shape.push_back(size); + } + } else { + op_id_[id] = token; + bool shape_data = false; + while (ss2 >> token) { + if (token == "shape:") { + shape_data = true; + } else if (shape_data) { + total_elements *= std::stoll(token); + shape.push_back(std::stoll(token)); + } else if (token != "inputs:") { + graph_[curr_subgraph][id].push_back(std::stoi(token)); + } + } + graph_[curr_subgraph][id].push_back(id); + } + DLContext ctx; + ctx.device_type = static_cast(1); + ctx.device_id = 0; + data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); + } +} + +TVM_REGISTER_GLOBAL("module.loadfile_gcc").set_body_typed(ExampleJSonModule::LoadFromFile); + +} +} \ No newline at end of file diff --git 
a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc index 043892a999a7..76e4d87d7250 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/tests/cpp/external_runtime_test.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -35,168 +36,23 @@ #include #include +using tvm::runtime::ExampleJSonModule; using tvm::runtime::Module; using tvm::runtime::ModuleNode; using tvm::runtime::NDArray; using tvm::runtime::Object; using tvm::runtime::ObjectPtr; using tvm::runtime::PackedFunc; -using tvm::runtime::TVMArgsSetter; using tvm::runtime::TVMArgs; +using tvm::runtime::TVMArgsSetter; using tvm::runtime::TVMRetValue; -void Add_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < len_a * len_b; i++) { - c[i] = a[i] + b[i]; - } -} - -int Add(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Add_(static_cast(arg0->data), arg0->shape[0], - static_cast(arg1->data), arg1->shape[0], - static_cast(out->data)); - return 0; -} - -void Sub_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < len_a * len_b; i++) { - c[i] = a[i] - b[i]; - } -} - -int Sub(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Sub_(static_cast(arg0->data), arg0->shape[0], - static_cast(arg1->data), arg1->shape[0], - static_cast(out->data)); - return 0; -} - -class ExampleJSonModule : public ModuleNode { - public: - ExampleJSonModule() {} - ~ExampleJSonModule() {} - - PackedFunc GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) final { - if (name == 
"example_json_rt") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - NDArray arg0 = args[0]; - NDArray arg1 = args[1]; - NDArray arg2 = args[2]; - this->data_entry_[0].CopyFrom(arg0); - this->data_entry_[1].CopyFrom(arg1); - this->data_entry_[2].CopyFrom(arg2); - for (const auto& it : this->graph_) { - this->run(it.first, it.second); - } - *rv = data_entry_.back(); - }); - } else { - LOG(FATAL) << "Unkown runtime type: " << name << "\n"; - return PackedFunc(); - } - } - - void run(int id, const std::vector& inputs) { - std::vector values(inputs.size()); - std::vector type_codes(inputs.size()); - TVMArgsSetter setter(values.data(), type_codes.data()); - - if (op_id_[id] == "add" || op_id_[id] == "sub") { - for (size_t i = 0; i < inputs.size(); i++) { - setter(i, data_entry_[inputs[i]]); - } - } - - if (op_id_[id] == "add") { - Add(values.data(), type_codes.data(), inputs.size()); - } else if (op_id_[id] == "sub") { - Sub(values.data(), type_codes.data(), inputs.size()); - } - } - - const char* type_key() const { return "examplejson"; } - - void SaveToBinary(dmlc::Stream* stream) final { - // Write to a json string. - } - - // Note this is a very simple json that only serves for demostration purpose. - // Users usually have their own format and they can serialize it using the - // SaveToBinary method and deserialize it using LoadFromFile. 
- void ParseJson(const std::string& json) { - std::string line; - std::stringstream ss(json); - - while (std::getline(ss, line, '\n')) { - std::stringstream ss2(line); - std::string token; - int id = 0; - - ss2 >> token; - ss2 >> id; - if (op_id_.size() <= static_cast(id)) { - op_id_.resize(id + 1); - data_entry_.resize(id + 1); - } - - int64_t total_elements = 1; - std::vector shape; - if (token == "input") { - int64_t size = 0; - while (ss2 >> size) { - total_elements *= size; - shape.push_back(size); - } - } else { - op_id_[id] = token; - bool shape_data = false; - while (ss2 >> token) { - if (token == "shape:") { - shape_data = true; - } else if (shape_data) { - total_elements *= std::stoll(token); - shape.push_back(std::stoll(token)); - } else if (token != "inputs:") { - graph_[id].push_back(std::stoi(token)); - } - } - graph_[id].push_back(id); - } - DLContext ctx; - ctx.device_type = static_cast(1); - ctx.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); - } - } - - static Module LoadFromFile(const std::string& json, const std::string& format) { - auto n = tvm::runtime::make_object(); - n->ParseJson(json); - return Module(n); - } - - private: - // op -> inputs - std::map > graph_; - std::vector data_entry_; - // id -> op - std::vector op_id_; -}; TEST(ExampleModule, Basic) { // This is a simple json format used for testing. Users/vendors can define // their own format. 
std::string json = + "gcc_0\n" "input 0 10 10\n" "input 1 10 10\n" "input 2 10 10\n" @@ -204,7 +60,7 @@ TEST(ExampleModule, Basic) { "sub 4 inputs: 3 2 shape: 10 10"; Module mod = ExampleJSonModule::LoadFromFile(json, ""); - PackedFunc f = mod.GetFunction("example_json_rt", false); + PackedFunc f = mod.GetFunction("gcc_0", false); auto a_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto b_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 742f2b617ba3..70ce5ce6d77d 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -491,6 +491,55 @@ def test_engine_extern(): options=["-O2", "-std=c++11", "-I"+tmp_path.relpath("")]) +def test_json_extern(): + if which("gcc") is None: + print("Skip test because gcc is not available.") + + # Get Json. + graph_json = get_whole_graph_json() + + # Get subgraph Json. + subgraph_json = ("gcc_0\n" + + "input 0 10 10\n" + + "input 1 10 10\n" + + "input 2 10 10\n" + + "input 3 10 10\n" + + "add 4 inputs: 0 1 shape: 10 10\n" + + "sub 5 inputs: 4 2 shape: 10 10\n" + + "mul 6 inputs: 5 3 shape: 10 10\n" + + "gcc_1\n" + + "input 0 10 10\n" + + "input 1 10 10\n" + + "input 2 10 10\n" + + "input 3 10 10\n" + + "add 4 inputs: 0 1 shape: 10 10\n" + + "sub 5 inputs: 4 2 shape: 10 10\n" + + "mul 6 inputs: 5 3 shape: 10 10") + + # load module for execution. 
+ lib = tvm.module.load(subgraph_json, 'gcc') + mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) + + x_data = np.random.rand(10, 10).astype('float32') + mod.set_input("x", x_data) + w_data = [] + for i in range(8): + data = np.random.rand(10, 10).astype('float32') + w_data.append(data) + var = "w" + str(i) + mod.set_input(var, data) + mod.run() + out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) + out = mod.get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), + np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) + + if __name__ == "__main__": test_dso_extern() test_engine_extern() + #test_json_extern() \ No newline at end of file From c0303b15d0f06b908e32a33fd9d6534b6b4b70df Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sat, 16 Nov 2019 22:02:48 -0800 Subject: [PATCH 12/22] fix merge --- include/tvm/runtime/contrib/gcc.h | 11 ++- src/runtime/contrib/gcc/gcc.cc | 7 +- tests/cpp/external_runtime_test.cc | 151 ----------------------------- 3 files changed, 9 insertions(+), 160 deletions(-) diff --git a/include/tvm/runtime/contrib/gcc.h b/include/tvm/runtime/contrib/gcc.h index 83e9c49c2164..8d75dad1db74 100644 --- a/include/tvm/runtime/contrib/gcc.h +++ b/include/tvm/runtime/contrib/gcc.h @@ -18,7 +18,7 @@ */ /*! - * \file external_runtime_test.cc + * \file gcc.h * \brief Test an example runtime module to interpreting a json string. 
*/ #ifndef TVM_RUNTIME_CONTRIB_GCC_H_ @@ -33,8 +33,10 @@ #include #include +#include #include #include +#include namespace tvm { namespace runtime { @@ -73,7 +75,6 @@ class ExampleJSonModule : public ModuleNode { // id -> op std::vector op_id_; }; -#endif - -} -} \ No newline at end of file +#endif // TVM_RUNTIME_CONTRIB_GCC_H_ +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc index 8a4860dc1e0c..62fc3b264555 100644 --- a/src/runtime/contrib/gcc/gcc.cc +++ b/src/runtime/contrib/gcc/gcc.cc @@ -83,8 +83,7 @@ PackedFunc ExampleJSonModule::GetFunction(const std::string& name, } *rv = data_entry_.back(); }); - } - else { + } else { LOG(FATAL) << "Unkown runtime type: " << name << "\n"; return PackedFunc(); } @@ -166,5 +165,5 @@ void ExampleJSonModule::ParseJson(const std::string& json) { TVM_REGISTER_GLOBAL("module.loadfile_gcc").set_body_typed(ExampleJSonModule::LoadFromFile); -} -} \ No newline at end of file +} // namespace runtime +} // namespace tvm diff --git a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc index 591e5f008c91..f73eda0455df 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/tests/cpp/external_runtime_test.cc @@ -47,157 +47,6 @@ using tvm::runtime::TVMArgs; using tvm::runtime::TVMArgsSetter; using tvm::runtime::TVMRetValue; -void Add_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < len_a * len_b; i++) { - c[i] = a[i] + b[i]; - } -} - -int Add(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Add_(static_cast(arg0->data), arg0->shape[0], - static_cast(arg1->data), arg1->shape[0], - static_cast(out->data)); - return 0; -} - -void Sub_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < 
len_a * len_b; i++) { - c[i] = a[i] - b[i]; - } -} - -int Sub(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Sub_(static_cast(arg0->data), arg0->shape[0], - static_cast(arg1->data), arg1->shape[0], - static_cast(out->data)); - return 0; -} - -class ExampleJSonModule : public ModuleNode { - public: - ExampleJSonModule() {} - ~ExampleJSonModule() {} - - PackedFunc GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) final { - if (name == "example_json_rt") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - NDArray arg0 = args[0]; - NDArray arg1 = args[1]; - NDArray arg2 = args[2]; - this->data_entry_[0].CopyFrom(arg0); - this->data_entry_[1].CopyFrom(arg1); - this->data_entry_[2].CopyFrom(arg2); - for (const auto& it : this->graph_) { - this->run(it.first, it.second); - } - *rv = data_entry_.back(); - }); - } else { - LOG(FATAL) << "Unkown runtime type: " << name << "\n"; - return PackedFunc(); - } - } - - void run(int id, const std::vector& inputs) { - std::vector values(inputs.size()); - std::vector type_codes(inputs.size()); - TVMArgsSetter setter(values.data(), type_codes.data()); - - if (op_id_[id] == "add" || op_id_[id] == "sub") { - for (size_t i = 0; i < inputs.size(); i++) { - setter(i, data_entry_[inputs[i]]); - } - } - - if (op_id_[id] == "add") { - Add(values.data(), type_codes.data(), inputs.size()); - } else if (op_id_[id] == "sub") { - Sub(values.data(), type_codes.data(), inputs.size()); - } - } - - const char* type_key() const { return "examplejson"; } - - void SaveToBinary(dmlc::Stream* stream) final { - // Write to a json string. - } - - // Note this is a very simple json that only serves for demostration purpose. 
- // Users usually have their own format and they can serialize it using the - // SaveToBinary method and deserialize it using LoadFromFile. - void ParseJson(const std::string& json) { - std::string line; - std::stringstream ss(json); - - while (std::getline(ss, line, '\n')) { - std::stringstream ss2(line); - std::string token; - int id = 0; - - ss2 >> token; - ss2 >> id; - if (op_id_.size() <= static_cast(id)) { - op_id_.resize(id + 1); - data_entry_.resize(id + 1); - } - - int64_t total_elements = 1; - std::vector shape; - if (token == "input") { - int64_t size = 0; - while (ss2 >> size) { - total_elements *= size; - shape.push_back(size); - } - } else { - op_id_[id] = token; - bool shape_data = false; - while (ss2 >> token) { - if (token == "shape:") { - shape_data = true; - } else if (shape_data) { - total_elements *= std::stoll(token); - shape.push_back(std::stoll(token)); - } else if (token != "inputs:") { - graph_[id].push_back(std::stoi(token)); - } - } - graph_[id].push_back(id); - } - DLContext ctx; - ctx.device_type = static_cast(1); - ctx.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); - } - } - - static Module LoadFromFile(const std::string& json, const std::string& format) { - auto n = tvm::runtime::make_object(); - n->ParseJson(json); - return Module(n); - } - - void SaveToFile(const std::string& file_name, const std::string& format) final {} - std::string GetSource(const std::string& format = "") final { return ""; } - - private: - // op -> inputs - std::map > graph_; - std::vector data_entry_; - // id -> op - std::vector op_id_; -}; - TEST(ExampleModule, Basic) { // This is a simple json format used for testing. Users/vendors can define // their own format. 
From 49b0b2f7b3ac8b1e2a2244cdc3a38a5168836ba4 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sat, 16 Nov 2019 22:52:13 -0800 Subject: [PATCH 13/22] fix ci --- cmake/modules/contrib/Extern.cmake | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 498ad3c9819c..c84d11c3cfdd 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -17,9 +17,6 @@ message(STATUS "Build with relay.backend.contrib") -list(FIND USE_EXTERN "gcc" _gcc_idx) -if(_gcc_idx GREATER -1) - file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) - list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) - message(STATUS "Use extern runtime: GCC") -endif() +file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) +list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) +message(STATUS "Use extern runtime: GCC") From 3c45d50a9c7fb404a066638ed404cb6a8889598d Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sun, 17 Nov 2019 08:55:41 +0000 Subject: [PATCH 14/22] revert --- CMakeLists.txt | 1 - cmake/modules/contrib/Extern.cmake | 22 --- include/tvm/runtime/contrib/gcc.h | 80 --------- src/runtime/contrib/gcc/gcc.cc | 169 -------------------- tests/cpp/external_runtime_test.cc | 161 ++++++++++++++++++- tests/python/relay/test_external_runtime.py | 4 +- 6 files changed, 160 insertions(+), 277 deletions(-) delete mode 100644 cmake/modules/contrib/Extern.cmake delete mode 100644 include/tvm/runtime/contrib/gcc.h delete mode 100644 src/runtime/contrib/gcc/gcc.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index e5d7c7c1ede1..2bea818b7581 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,7 +246,6 @@ include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/Random.cmake) -include(cmake/modules/contrib/Extern.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) 
include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake deleted file mode 100644 index c84d11c3cfdd..000000000000 --- a/cmake/modules/contrib/Extern.cmake +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -message(STATUS "Build with relay.backend.contrib") - -file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) -list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) -message(STATUS "Use extern runtime: GCC") diff --git a/include/tvm/runtime/contrib/gcc.h b/include/tvm/runtime/contrib/gcc.h deleted file mode 100644 index 8d75dad1db74..000000000000 --- a/include/tvm/runtime/contrib/gcc.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file gcc.h - * \brief Test an example runtime module to interpreting a json string. - */ -#ifndef TVM_RUNTIME_CONTRIB_GCC_H_ -#define TVM_RUNTIME_CONTRIB_GCC_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace tvm { -namespace runtime { - -class ExampleJSonModule : public ModuleNode { - public: - PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final; - - void run(int id, const std::vector& inputs); - - const char* type_key() const { return "examplejson"; } - - void SaveToBinary(dmlc::Stream* stream) final { - // Write to a json string. - } - - // Note this is a very simple json that only serves for demostration purpose. - // Users usually have their own format and they can serialize it using the - // SaveToBinary method and deserialize it using LoadFromFile. 
- void ParseJson(const std::string& json); - - static Module LoadFromFile(const std::string& json, const std::string& format) { - auto n = tvm::runtime::make_object(); - n->ParseJson(json); - return Module(n); - } - - void SaveToFile(const std::string& file_name, const std::string& format) final {} - std::string GetSource(const std::string& format = "") final { return ""; } - - private: - std::string curr_subgraph_; - // op -> inputs - std::map >> graph_; - std::vector data_entry_; - // id -> op - std::vector op_id_; -}; -#endif // TVM_RUNTIME_CONTRIB_GCC_H_ -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc deleted file mode 100644 index 62fc3b264555..000000000000 --- a/src/runtime/contrib/gcc/gcc.cc +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file external_runtime_test.cc - * \brief Test an example runtime module to interpreting a json string. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace tvm { -namespace runtime { - -void Add_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < len_a * len_b; i++) { - c[i] = a[i] + b[i]; - } -} - -int Add(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Add_(static_cast(arg0->data), arg0->shape[0], static_cast(arg1->data), - arg1->shape[0], static_cast(out->data)); - return 0; -} - -void Sub_(float* a, int len_a, float* b, int len_b, float* c) { - for (int i = 0; i < len_a * len_b; i++) { - c[i] = a[i] - b[i]; - } -} - -int Sub(TVMValue* value, int* type_code, int nargs) { - CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* out = static_cast(value[2].v_handle); - Sub_(static_cast(arg0->data), arg0->shape[0], static_cast(arg1->data), - arg1->shape[0], static_cast(out->data)); - return 0; -} - -PackedFunc ExampleJSonModule::GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) { - if (this->graph_.find(name) != this->graph_.end()) { - this->curr_subgraph_ = name; - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - for (uint32_t i = 0; i < args.size(); ++i) { - NDArray arg = args[i]; - this->data_entry_[i].CopyFrom(arg); - } - for (const auto& it : this->graph_[this->curr_subgraph_]) { - this->run(it.first, it.second); - } - *rv = data_entry_.back(); - }); - } else { - LOG(FATAL) << "Unkown runtime type: " << name << "\n"; - return PackedFunc(); - } -} - -void ExampleJSonModule::run(int id, const std::vector& inputs) { - std::vector values(inputs.size()); - std::vector 
type_codes(inputs.size()); - TVMArgsSetter setter(values.data(), type_codes.data()); - - if (op_id_[id] == "add" || op_id_[id] == "sub") { - for (size_t i = 0; i < inputs.size(); i++) { - setter(i, data_entry_[inputs[i]]); - } - } - - if (op_id_[id] == "add") { - Add(values.data(), type_codes.data(), inputs.size()); - } else if (op_id_[id] == "sub") { - Sub(values.data(), type_codes.data(), inputs.size()); - } -} - -// Note this is a very simple json that only serves for demostration purpose. -// Users usually have their own format and they can serialize it using the -// SaveToBinary method and deserialize it using LoadFromFile. -void ExampleJSonModule::ParseJson(const std::string& json) { - std::string line; - std::string curr_subgraph; - std::stringstream ss(json); - - while (std::getline(ss, line, '\n')) { - std::stringstream ss2(line); - std::string token; - int id = 0; - - ss2 >> token; - if (token.find("gcc_") != std::string::npos) { - curr_subgraph = token; - graph_[curr_subgraph]; - continue; - } - - ss2 >> id; - if (op_id_.size() <= static_cast(id)) { - op_id_.resize(id + 1); - data_entry_.resize(id + 1); - } - - int64_t total_elements = 1; - std::vector shape; - if (token == "input") { - int64_t size = 0; - while (ss2 >> size) { - total_elements *= size; - shape.push_back(size); - } - } else { - op_id_[id] = token; - bool shape_data = false; - while (ss2 >> token) { - if (token == "shape:") { - shape_data = true; - } else if (shape_data) { - total_elements *= std::stoll(token); - shape.push_back(std::stoll(token)); - } else if (token != "inputs:") { - graph_[curr_subgraph][id].push_back(std::stoi(token)); - } - } - graph_[curr_subgraph][id].push_back(id); - } - DLContext ctx; - ctx.device_type = static_cast(1); - ctx.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); - } -} - -TVM_REGISTER_GLOBAL("module.loadfile_gcc").set_body_typed(ExampleJSonModule::LoadFromFile); - -} // namespace runtime -} // namespace tvm 
diff --git a/tests/cpp/external_runtime_test.cc b/tests/cpp/external_runtime_test.cc index f73eda0455df..9dc5ec0ed52e 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/tests/cpp/external_runtime_test.cc @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -33,10 +32,11 @@ #include #include +#include #include #include +#include -using tvm::runtime::ExampleJSonModule; using tvm::runtime::Module; using tvm::runtime::ModuleNode; using tvm::runtime::NDArray; @@ -47,11 +47,163 @@ using tvm::runtime::TVMArgs; using tvm::runtime::TVMArgsSetter; using tvm::runtime::TVMRetValue; +void Add_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] + b[i]; + } +} + +int Add(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Add_(static_cast(arg0->data), arg0->shape[0], + static_cast(arg1->data), arg1->shape[0], + static_cast(out->data)); + return 0; +} + +void Sub_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] - b[i]; + } +} + +int Sub(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Sub_(static_cast(arg0->data), arg0->shape[0], + static_cast(arg1->data), arg1->shape[0], + static_cast(out->data)); + return 0; +} + +class ExampleJSonModule : public ModuleNode { + public: + PackedFunc GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) final { + if (this->graph_.find(name) != this->graph_.end()) { + this->curr_subgraph_ = name; + return PackedFunc([sptr_to_self, this](TVMArgs 
args, TVMRetValue* rv) { + for (auto i = 0; i < args.size(); ++i) { + NDArray arg = args[i]; + this->data_entry_[i].CopyFrom(arg); + } + for (const auto& it : this->graph_[this->curr_subgraph_]) { + this->run(it.first, it.second); + } + *rv = data_entry_.back(); + }); + } else { + LOG(FATAL) << "Unkown runtime type: " << name << "\n"; + return PackedFunc(); + } + } + + void run(int id, const std::vector& inputs) { + std::vector values(inputs.size()); + std::vector type_codes(inputs.size()); + TVMArgsSetter setter(values.data(), type_codes.data()); + + if (op_id_[id] == "add" || op_id_[id] == "sub") { + for (size_t i = 0; i < inputs.size(); i++) { + setter(i, data_entry_[inputs[i]]); + } + } + + if (op_id_[id] == "add") { + Add(values.data(), type_codes.data(), inputs.size()); + } else if (op_id_[id] == "sub") { + Sub(values.data(), type_codes.data(), inputs.size()); + } + } + + const char* type_key() const { return "examplejson"; } + + void SaveToBinary(dmlc::Stream* stream) final { + // Write to a json string. + } + + // Note this is a very simple json that only serves for demostration purpose. + // Users usually have their own format and they can serialize it using the + // SaveToBinary method and deserialize it using LoadFromFile. 
+ void ParseJson(const std::string& json) { + std::string line; + std::string curr_subgraph; + std::stringstream ss(json); + + while (std::getline(ss, line, '\n')) { + std::stringstream ss2(line); + std::string token; + int id = 0; + + ss2 >> token; + if (token.find("json_rt_") != std::string::npos) { + curr_subgraph = token; + graph_[curr_subgraph]; + continue; + } + + ss2 >> id; + if (op_id_.size() <= static_cast(id)) { + op_id_.resize(id + 1); + data_entry_.resize(id + 1); + } + + int64_t total_elements = 1; + std::vector shape; + if (token == "input") { + int64_t size = 0; + while (ss2 >> size) { + total_elements *= size; + shape.push_back(size); + } + } else { + op_id_[id] = token; + bool shape_data = false; + while (ss2 >> token) { + if (token == "shape:") { + shape_data = true; + } else if (shape_data) { + total_elements *= std::stoll(token); + shape.push_back(std::stoll(token)); + } else if (token != "inputs:") { + graph_[curr_subgraph][id].push_back(std::stoi(token)); + } + } + graph_[curr_subgraph][id].push_back(id); + } + DLContext ctx; + ctx.device_type = static_cast(1); + ctx.device_id = 0; + data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); + } + } + + static Module LoadFromFile(const std::string& json, + const std::string& format) { + auto n = tvm::runtime::make_object(); + n->ParseJson(json); + return Module(n); + } + + private: + std::string curr_subgraph_; + // op -> inputs + std::map>> graph_; + std::vector data_entry_; + // id -> op + std::vector op_id_; +}; + TEST(ExampleModule, Basic) { // This is a simple json format used for testing. Users/vendors can define // their own format. 
std::string json = - "gcc_0\n" + "json_rt_0\n" "input 0 10 10\n" "input 1 10 10\n" "input 2 10 10\n" @@ -59,7 +211,7 @@ TEST(ExampleModule, Basic) { "sub 4 inputs: 3 2 shape: 10 10"; Module mod = ExampleJSonModule::LoadFromFile(json, ""); - PackedFunc f = mod.GetFunction("gcc_0", false); + PackedFunc f = mod.GetFunction("json_rt_0", false); auto a_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); auto b_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); @@ -90,3 +242,4 @@ int main(int argc, char** argv) { testing::FLAGS_gtest_death_test_style = "threadsafe"; return RUN_ALL_TESTS(); } + diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 70ce5ce6d77d..c81580075110 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -17,6 +17,7 @@ """Unit tests for external runtime.""" from shutil import which import json +import pytest import numpy as np import tvm @@ -491,6 +492,7 @@ def test_engine_extern(): options=["-O2", "-std=c++11", "-I"+tmp_path.relpath("")]) +@pytest.mark.skip(reason="Support subgraph for json runtime later.") def test_json_extern(): if which("gcc") is None: print("Skip test because gcc is not available.") @@ -542,4 +544,4 @@ def test_json_extern(): if __name__ == "__main__": test_dso_extern() test_engine_extern() - #test_json_extern() \ No newline at end of file + # test_json_extern() From d87e632f670855f04ddac0c8e114d5a32cdee41b Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 18 Nov 2019 02:56:05 +0000 Subject: [PATCH 15/22] TVM_DLL ModuleNode --- include/tvm/runtime/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index ff096eec5a43..b63b9bb74ee9 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -111,7 +111,7 @@ class Module : public ObjectRef { * * \endcode */ -class ModuleNode : public Object { 
+class TVM_DLL ModuleNode : public Object { public: /*! \brief virtual destructor */ virtual ~ModuleNode() {} From 07fbe5192b7e7fc352027fe826cd6436115c2030 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Tue, 19 Nov 2019 01:31:32 +0000 Subject: [PATCH 16/22] move runtime examples to apps --- apps/README.md | 1 + apps/ext_runtime/Makefile | 34 ++ apps/ext_runtime/README.md | 26 + .../python/ext_json_rt/__init__.py | 19 + .../ext_runtime/src/ext_json_rt.cc | 134 +++-- apps/ext_runtime/tests/test_rt.py | 551 ++++++++++++++++++ src/runtime/graph/graph_runtime.cc | 2 +- tests/python/relay/test_external_runtime.py | 50 -- 8 files changed, 705 insertions(+), 112 deletions(-) create mode 100644 apps/ext_runtime/Makefile create mode 100644 apps/ext_runtime/README.md create mode 100644 apps/ext_runtime/python/ext_json_rt/__init__.py rename tests/cpp/external_runtime_test.cc => apps/ext_runtime/src/ext_json_rt.cc (67%) create mode 100644 apps/ext_runtime/tests/test_rt.py diff --git a/apps/README.md b/apps/README.md index 685750633493..58c9447f9bd8 100644 --- a/apps/README.md +++ b/apps/README.md @@ -26,3 +26,4 @@ If you are interested in writing optimized kernels with TVM, checkout [TOPI: TVM - [android_rpc](android_rpc) Android RPC server. - [benchmark](benchmark) Example end to end compilation benchmarks - [howto_deploy](howto_deploy) Tutorial on how to deploy TVM with minimum code dependency. +- [ext_runtime](ext_runtime) How to extend TVM runtime for external backends. diff --git a/apps/ext_runtime/Makefile b/apps/ext_runtime/Makefile new file mode 100644 index 000000000000..ec50477a7247 --- /dev/null +++ b/apps/ext_runtime/Makefile @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Minimum Makefile for the extension package +TVM_ROOT=$(shell cd ../..; pwd) +PKG_CFLAGS = -std=c++11 -O2 -fPIC\ + -I${TVM_ROOT}/include\ + -I${TVM_ROOT}/3rdparty/dmlc-core/include\ + -I${TVM_ROOT}/3rdparty/dlpack/include\ + +PKG_LDFLAGS =-L${TVM_ROOT}/build +UNAME_S := $(shell uname -s) + +ifeq ($(UNAME_S), Darwin) + PKG_LDFLAGS += -undefined dynamic_lookup +endif + +lib/libtvm_ext_json_rt.so: src/ext_json_rt.cc + @mkdir -p $(@D) + $(CXX) $(PKG_CFLAGS) -shared -o $@ $^ $(PKG_LDFLAGS) diff --git a/apps/ext_runtime/README.md b/apps/ext_runtime/README.md new file mode 100644 index 000000000000..142309fc42e9 --- /dev/null +++ b/apps/ext_runtime/README.md @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + +Example External Runtime +======================== +This folder contains external runtime examples of TVM. + +- Extend DSO module as an external runtime. +- Extend TVM runtime module as a customized JSON runtime. +- The python module that creates new external runtime and intergrates into TVM exported library. 
+ diff --git a/apps/ext_runtime/python/ext_json_rt/__init__.py b/apps/ext_runtime/python/ext_json_rt/__init__.py new file mode 100644 index 000000000000..e18a1a423322 --- /dev/null +++ b/apps/ext_runtime/python/ext_json_rt/__init__.py @@ -0,0 +1,19 @@ +"""Example extension package of TVM.""" +from __future__ import absolute_import +import os +import ctypes +# Import TVM first to get library symbols +import tvm + +def load_lib(): + """Load library, the functions will be registered into TVM""" + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + # load in as global so the global extern symbol is visible to other dll. + lib = ctypes.CDLL( + os.path.join(curr_path, "../../lib/libtvm_ext_json_rt.so"), ctypes.RTLD_GLOBAL) + return lib + +_LIB = load_lib() + +create_json_rt = tvm.get_global_func("ext_json_rt.create_json_rt") + diff --git a/tests/cpp/external_runtime_test.cc b/apps/ext_runtime/src/ext_json_rt.cc similarity index 67% rename from tests/cpp/external_runtime_test.cc rename to apps/ext_runtime/src/ext_json_rt.cc index 9dc5ec0ed52e..721f078f83ad 100644 --- a/tests/cpp/external_runtime_test.cc +++ b/apps/ext_runtime/src/ext_json_rt.cc @@ -22,7 +22,6 @@ * \brief Test an example runtime module to interpreting a json string. 
*/ #include -#include #include #include #include @@ -37,15 +36,10 @@ #include #include -using tvm::runtime::Module; -using tvm::runtime::ModuleNode; -using tvm::runtime::NDArray; -using tvm::runtime::Object; -using tvm::runtime::ObjectPtr; -using tvm::runtime::PackedFunc; -using tvm::runtime::TVMArgs; -using tvm::runtime::TVMArgsSetter; -using tvm::runtime::TVMRetValue; +using namespace tvm::runtime; + +namespace tvm { +namespace runtime { void Add_(float* a, int len_a, float* b, int len_b, float* c) { for (int i = 0; i < len_a * len_b; i++) { @@ -81,19 +75,57 @@ int Sub(TVMValue* value, int* type_code, int nargs) { return 0; } -class ExampleJSonModule : public ModuleNode { +void Mul_(float* a, int len_a, float* b, int len_b, float* c) { + for (int i = 0; i < len_a * len_b; i++) { + c[i] = a[i] * b[i]; + } +} + +int Mul(TVMValue* value, int* type_code, int nargs) { + CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n"; + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* out = static_cast(value[2].v_handle); + Mul_(static_cast(arg0->data), arg0->shape[0], + static_cast(arg1->data), arg1->shape[0], + static_cast(out->data)); + return 0; +} + +class ExampleJsonModule : public ModuleNode { public: + ExampleJsonModule(std::string graph_json) { + this->graph_json_ = graph_json; + ParseJson(graph_json); + } + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { if (this->graph_.find(name) != this->graph_.end()) { this->curr_subgraph_ = name; return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { for (auto i = 0; i < args.size(); ++i) { - NDArray arg = args[i]; - this->data_entry_[i].CopyFrom(arg); + CHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle) + << "Expect NDArray or DLTensor as inputs" + << "\n"; + if (args[i].type_code() == kArrayHandle) { + DLTensor* arg = args[i]; + this->data_entry_[i].CopyFrom(arg); 
+ } else { + NDArray arg = args[i]; + this->data_entry_[i].CopyFrom(arg); + } } for (const auto& it : this->graph_[this->curr_subgraph_]) { - this->run(it.first, it.second); + this->Run(it.first, it.second); + } + auto out_idx = outs_[this->curr_subgraph_]; + if (args[args.size() - 1].type_code() == kArrayHandle) { + DLTensor* arg = args[args.size() - 1]; + this->data_entry_[out_idx].CopyTo(arg); + } else { + NDArray arg = args[args.size() - 1]; + this->data_entry_[out_idx].CopyTo(arg); } *rv = data_entry_.back(); }); @@ -103,12 +135,12 @@ class ExampleJSonModule : public ModuleNode { } } - void run(int id, const std::vector& inputs) { + void Run(int id, const std::vector& inputs) { std::vector values(inputs.size()); std::vector type_codes(inputs.size()); TVMArgsSetter setter(values.data(), type_codes.data()); - if (op_id_[id] == "add" || op_id_[id] == "sub") { + if (op_id_[id] == "add" || op_id_[id] == "sub" || op_id_[id] == "mul") { for (size_t i = 0; i < inputs.size(); i++) { setter(i, data_entry_[inputs[i]]); } @@ -118,13 +150,17 @@ class ExampleJSonModule : public ModuleNode { Add(values.data(), type_codes.data(), inputs.size()); } else if (op_id_[id] == "sub") { Sub(values.data(), type_codes.data(), inputs.size()); + } else if (op_id_[id] == "mul") { + Mul(values.data(), type_codes.data(), inputs.size()); + } else { + LOG(FATAL) << "Unknown op: " << op_id_[id] << "\n"; } } const char* type_key() const { return "examplejson"; } void SaveToBinary(dmlc::Stream* stream) final { - // Write to a json string. + stream->Write(this->graph_json_); } // Note this is a very simple json that only serves for demostration purpose. 
@@ -175,6 +211,7 @@ class ExampleJSonModule : public ModuleNode { } } graph_[curr_subgraph][id].push_back(id); + outs_[curr_subgraph] = id; } DLContext ctx; ctx.device_type = static_cast(1); @@ -185,61 +222,36 @@ class ExampleJSonModule : public ModuleNode { static Module LoadFromFile(const std::string& json, const std::string& format) { - auto n = tvm::runtime::make_object(); - n->ParseJson(json); + auto n = tvm::runtime::make_object(json); return Module(n); } + static Module LoadFromBinary(void* strm) { + dmlc::Stream* stream = static_cast(strm); + std::string graph_json; + stream->Read(&graph_json); + auto n = tvm::runtime::make_object(graph_json); + return Module(n); + } + private: + std::string graph_json_; std::string curr_subgraph_; - // op -> inputs + // subgraph_id -> op -> inputs std::map>> graph_; + // subgraph_id -> out + std::map outs_; std::vector data_entry_; // id -> op std::vector op_id_; }; -TEST(ExampleModule, Basic) { - // This is a simple json format used for testing. Users/vendors can define - // their own format. - std::string json = - "json_rt_0\n" - "input 0 10 10\n" - "input 1 10 10\n" - "input 2 10 10\n" - "add 3 inputs: 0 1 shape: 10 10\n" - "sub 4 inputs: 3 2 shape: 10 10"; - - Module mod = ExampleJSonModule::LoadFromFile(json, ""); - PackedFunc f = mod.GetFunction("json_rt_0", false); - - auto a_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto b_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto c_val = NDArray::Empty({10, 10}, {kDLFloat, 32, 1}, {kDLCPU, 0}); - - float* pa = (float*)a_val.ToDLPack()->dl_tensor.data; - float* pb = (float*)b_val.ToDLPack()->dl_tensor.data; - float* pc = (float*)c_val.ToDLPack()->dl_tensor.data; - - // Assign values. 
- for (int i = 0; i < 10 * 10; i++) { - pa[i] = i; - pb[i] = i + 1.0; - pc[i] = i + 2.0; - } - - NDArray out = f(a_val, b_val, c_val); - float* p_out = (float*)out.ToDLPack()->dl_tensor.data; +TVM_REGISTER_GLOBAL("ext_json_rt.create_json_rt") + .set_body_typed(ExampleJsonModule::LoadFromFile); - // Check correctness of result - for (int i = 0; i < 10; i++) { - CHECK_LT(std::fabs(p_out[i] - ((i + (i + 1.0) - (i + 2.0)))), 1e-5); - } -} +TVM_REGISTER_GLOBAL("module.loadbinary_examplejson") + .set_body_typed(ExampleJsonModule::LoadFromBinary); -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - testing::FLAGS_gtest_death_test_style = "threadsafe"; - return RUN_ALL_TESTS(); -} +} // namespace runtime +} // namespace tvm diff --git a/apps/ext_runtime/tests/test_rt.py b/apps/ext_runtime/tests/test_rt.py new file mode 100644 index 000000000000..78c565495ffa --- /dev/null +++ b/apps/ext_runtime/tests/test_rt.py @@ -0,0 +1,551 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import ext_json_rt +from shutil import which +import json +import pytest +import sys +import numpy as np + +import tvm +from tvm import relay +from tvm import module as _tvm_module +from tvm.contrib import util + +tmp_path = util.tempdir() + + +def generate_csource_module(): + """Mock the codegen with an external library (e.g., CBLAS/cuDNN)""" + + code = r''' + #include + #include + #include + #include + #include + + #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + out[i] = a[i] p_OP_ b[i]; \ + } \ + } + + #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ + } + GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10); + GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10); + GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10); + + extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, + float* gcc_input6, float* gcc_input7, float* out) { + float* buf_0 = (float*)malloc(4 * 100); + float* buf_1 = (float*)malloc(4 * 100); + gcc_1_2(gcc_input4, gcc_input5, buf_0); + gcc_1_1(buf_0, gcc_input6, buf_1); + gcc_1_0(buf_1, gcc_input7, out); + free(buf_0); + free(buf_1); + } + + extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_1_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + + GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10); + 
GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10); + GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10); + + extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, + float* gcc_input2, float* gcc_input3, float* out) { + float* buf_0 = (float*)malloc(4 * 100); + float* buf_1 = (float*)malloc(4 * 100); + gcc_0_2(gcc_input0, gcc_input1, buf_0); + gcc_0_1(buf_0, gcc_input2, buf_1); + gcc_0_0(buf_1, gcc_input3, out); + free(buf_0); + free(buf_1); + } + + extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_0_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + ''' + csource_module = _tvm_module.csource_module_create(code, "cc") + return csource_module + + +def generate_engine_module(): + """ + Mock the codegen of an external backend with its own runtime engine + (e.g., MKL-DNN/TensorRT) + """ + + code = r''' + #include + #include + #include "gcc_engine.h" + + extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, + float* gcc_input6, float* gcc_input7, float* out) { + + std::string graph = + "add_2d,10,10\n" + "sub_2d,10,10\n" + "mul_2d,10,10\n"; + + Engine engine; + engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out); + } + + + extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = 
static_cast(value[4].v_handle); + gcc_1_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + + extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, + float* gcc_input2, float* gcc_input3, float* out) { + + std::string graph = + "add_2d,10,10\n" + "sub_2d,10,10\n" + "mul_2d,10,10\n"; + + Engine engine; + engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out); + + } + + extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) { + if (nargs != 5) { + printf("Expect 5 args, but get %d", nargs); + return 1; + } + DLTensor* arg0 = static_cast(value[0].v_handle); + DLTensor* arg1 = static_cast(value[1].v_handle); + DLTensor* arg2 = static_cast(value[2].v_handle); + DLTensor* arg3 = static_cast(value[3].v_handle); + DLTensor* out = static_cast(value[4].v_handle); + gcc_0_(static_cast(arg0->data), static_cast(arg1->data), + static_cast(arg2->data), static_cast(arg3->data), + static_cast(out->data)); + return 0; + } + ''' + + gen_gcc_engine() + csource_module = _tvm_module.csource_module_create(code, "cc") + return csource_module + + +def gen_gcc_engine(): + """An example of external backend runtime engine. This is supposed to be provided + by third-party vendors and included when building the generated external kernel code. 
+ """ + + code = r''' + #ifndef _GCC_ENGINE_H_ + #define _GCC_ENGINE_H_ + #include + #include + #include + #include + + #define GCC_BINARY_OP_2D(p_ID_, p_OP_) \ + void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) { \ + for (int64_t i = 0; i < dim1; ++i) { \ + for (int64_t j = 0; j < dim2; ++j) { \ + int64_t k = i * dim2 + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ + } + GCC_BINARY_OP_2D(add_2d, +); + GCC_BINARY_OP_2D(sub_2d, -); + GCC_BINARY_OP_2D(mul_2d, *); + + struct Layer { + void (*op)(int64_t, int64_t, float*, float*, float*); + std::vector shapes; + std::vector args; + }; + + class Engine { + public: + float* alloc_buffer(int64_t size) { + float* buf = (float*)malloc(sizeof(float) * size); + buffers.push_back(buf); + return buf; + } + void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) { + Layer layer; + layer.shapes.push_back(dim1); + layer.shapes.push_back(dim2); + layer.args.push_back(in1); + layer.args.push_back(in2); + layer.args.push_back(out); + + if (op == "add_2d") + layer.op = &add_2d; + else if (op == "sub_2d") + layer.op = &sub_2d; + else if (op == "mul_2d") + layer.op = &mul_2d; + net.push_back(layer); + return ; + } + + void run(std::string graph, std::vector args, float* out) { + std::stringstream ss(graph); + std::string line; + int layer_idx = 0; + int arg_idx = 0; + float* buf = nullptr; + + while (std::getline(ss, line, '\n')) { + std::stringstream ss2(line); + std::string token; + std::vector attrs; + while (std::getline(ss2, token, ',')) { + attrs.push_back(token); + } + int64_t dim1 = stoll(attrs[1]); + int64_t dim2 = stoll(attrs[2]); + auto out_buf = this->alloc_buffer(dim1 * dim2); + + if (layer_idx == 0) { + this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf); + buf = out_buf; + arg_idx = 2; + } + else { + this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf); + buf = out_buf; + arg_idx++; + } + layer_idx++; + } + this->net.back().args.back() = out; + + for 
(auto layer : net) { + (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1], layer.args[2]); + } + } + ~Engine() { + for (auto buf : buffers) { + free(buf); + } + } + private: + std::vector net; + std::vector buffers; + }; + + #endif + ''' + header_file = tmp_path.relpath("gcc_engine.h") + with open(header_file, 'w') as f: + f.write(code) + + +def get_synthetic_lib(): + x = relay.var('x', shape=(10, 10)) + w0 = relay.var('w0', shape=(10, 10)) + w1 = relay.var('w1', shape=(10, 10)) + w2 = relay.var('w2', shape=(10, 10)) + w3 = relay.var('w3', shape=(10, 10)) + w4 = relay.var('w4', shape=(10, 10)) + w5 = relay.var('w5', shape=(10, 10)) + w6 = relay.var('w6', shape=(10, 10)) + w7 = relay.var('w7', shape=(10, 10)) + + # subgraph0 + gcc_input0 = relay.var('gcc_input0', shape=(10, 10)) + gcc_input1 = relay.var('gcc_input1', shape=(10, 10)) + gcc_input2 = relay.var('gcc_input2', shape=(10, 10)) + gcc_input3 = relay.var('gcc_input3', shape=(10, 10)) + subgraph0 = relay.Function([gcc_input0, gcc_input1, gcc_input2, + gcc_input3], relay.copy(gcc_input0)) + subgraph0 = subgraph0.set_attribute( + "Primitive", tvm.expr.IntImm("int32", 1)) + + # Call subgraph0 + subgraph0_ret = relay.Call(subgraph0, [x, w0, w1, w2]) + + # subgraph1 + gcc_input4 = relay.var('gcc_input4', shape=(10, 10)) + gcc_input5 = relay.var('gcc_input5', shape=(10, 10)) + gcc_input6 = relay.var('gcc_input6', shape=(10, 10)) + gcc_input7 = relay.var('gcc_input7', shape=(10, 10)) + subgraph1 = relay.Function([gcc_input4, gcc_input5, gcc_input6, + gcc_input7], relay.copy(gcc_input4)) + subgraph1 = subgraph1.set_attribute( + "Primitive", tvm.expr.IntImm("int32", 1)) + + # Call subgraph1 + subgraph1_ret = relay.Call(subgraph1, [x, w3, w4, w5]) + + # Other ops that will be executed on TVM. 
+ add2 = relay.add(x, w6) + sub2 = relay.subtract(add2, w7) + ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0) + func = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], ret) + mod = relay.Module.from_expr(func) + _, lib, _ = relay.build(mod, "llvm") + return lib + +def get_whole_graph_json(): + nodex = {"op": "null", "name": "x", "inputs": []} + node0 = {"op": "null", "name": "w0", "inputs": []} + node1 = {"op": "null", "name": "w1", "inputs": []} + node2 = {"op": "null", "name": "w2", "inputs": []} + node3 = {"op": "null", "name": "w3", "inputs": []} + node4 = {"op": "null", "name": "w4", "inputs": []} + node5 = {"op": "null", "name": "w5", "inputs": []} + node6 = {"op": "null", "name": "w6", "inputs": []} + node7 = {"op": "null", "name": "w7", "inputs": []} + + subgraph0 = { + "op": "tvm_op", + "name": "json_rt_0", + "attrs": { + "num_outputs": "1", + "num_inputs": "4", + "func_name": "json_rt_0", + "flatten_data": "0" + }, + "inputs": [ + [0, 0, 0], + [1, 0, 0], + [2, 0, 0], + [3, 0, 0], + ] + } + subgraph1 = { + "op": "tvm_op", + "name": "json_rt_1", + "attrs": { + "num_outputs": "1", + "num_inputs": "4", + "func_name": "json_rt_1", + "flatten_data": "0" + }, + "inputs": [ + [0, 0, 0], + [4, 0, 0], + [5, 0, 0], + [6, 0, 0], + ] + } + + fused_op = { + "op": "tvm_op", + "name": "fused_add_subtract_concatenate", + "attrs": { + "num_outputs": "1", + "num_inputs": "5", + "func_name": "fused_add_subtract_concatenate", + "flatten_data": "0" + }, + "inputs": [ + [9, 0, 0], + [10, 0, 0], + [0, 0, 0], + [7, 0, 0], + [8, 0, 0] + ] + } + nodes = [nodex, node0, node1, node2, node3, node4, + node5, node6, node7, subgraph0, subgraph1, fused_op] + arg_nodes = [0, 1, 2, 3, 4, 5, 6, 7, 8] + heads = [[11, 0, 0]] + node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + storage_id = ["list_int", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]] + + shape = ["list_shape", [ + [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], + [10, 10], [10, 10], [10, 10], 
[10, 10], [10, 10], [30, 10]]] + + dltype = ["list_str", [ + "float32", "float32", "float32", "float32", "float32", "float32", + "float32", "float32", "float32", "float32", "float32", "float32"]] + + attrs = { + "shape": shape, + "dltype": dltype, + "storage_id": storage_id, + } + + graph = {"nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": heads, + "attrs": attrs} + + return json.dumps(graph) + + +def run_extern(label, get_extern_src, **kwargs): + if which("gcc") is None: + print("Skip test because gcc is not available.") + + obj_name = "{}.o".format(label) + lib_name = "external_{}.so".format(label) + + # Get Json and the compiled library. + graph_json = get_whole_graph_json() + lib = get_synthetic_lib() + lib.save(obj_name) + + # library that contains external code. + csource_module = get_extern_src() + kwargs["options"] = [obj_name] + kwargs["options"] + lib_path = tmp_path.relpath(lib_name) + csource_module.export_library(lib_path, fcompile=False, **kwargs) + # load module for execution. 
+ lib = tvm.module.load(lib_path) + mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) + + x_data = np.random.rand(10, 10).astype('float32') + mod.set_input("x", x_data) + w_data = [] + for i in range(8): + data = np.random.rand(10, 10).astype('float32') + w_data.append(data) + var = "w" + str(i) + mod.set_input(var, data) + mod.run() + out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) + out = mod.get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), + np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) + + +def tutorial_dso_extern(): + run_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) + + +def tutorial_engine_extern(): + run_extern("engine", + generate_engine_module, + options=["-O2", "-std=c++11", "-I" + tmp_path.relpath("")]) + +def tutorial_json_extern(): + if which("gcc") is None: + print("Skip test because gcc is not available.") + + # Get subgraph Json. + subgraph_json = ("json_rt_0\n" + + "input 0 10 10\n" + + "input 1 10 10\n" + + "input 2 10 10\n" + + "input 3 10 10\n" + + "add 4 inputs: 0 1 shape: 10 10\n" + + "sub 5 inputs: 4 2 shape: 10 10\n" + + "mul 6 inputs: 5 3 shape: 10 10\n" + + "json_rt_1\n" + + "input 0 10 10\n" + + "input 1 10 10\n" + + "input 2 10 10\n" + + "input 3 10 10\n" + + "add 4 inputs: 0 1 shape: 10 10\n" + + "sub 5 inputs: 4 2 shape: 10 10\n" + + "mul 6 inputs: 5 3 shape: 10 10") + + # Get Json and module. + graph_json = get_whole_graph_json() + lib = get_synthetic_lib() + ext_lib = ext_json_rt.create_json_rt(subgraph_json, "") + lib.import_module(ext_lib) + lib.export_library('external.so') + + # load module for execution. 
+ lib = tvm.module.load('external.so') + mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) + + x_data = np.random.rand(10, 10).astype('float32') + mod.set_input("x", x_data) + w_data = [] + for i in range(8): + data = np.random.rand(10, 10).astype('float32') + w_data.append(data) + var = "w" + str(i) + mod.set_input(var, data) + + mod.run() + out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) + out = mod.get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), + np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) + + +if __name__ == "__main__": + tutorial_dso_extern() + tutorial_engine_extern() + tutorial_json_extern() \ No newline at end of file diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 9ad10c1232c3..06e5fef43de7 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -396,7 +396,7 @@ std::pair, std::shared_ptr > GraphRu // Get compiled function from the module that contains both host and device // code. - tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false); + tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, true); CHECK(pf != nullptr) << "no such function in module: " << param.func_name; auto fexec = [arg_ptr, pf]() { diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index c81580075110..504d5020093f 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -492,56 +492,6 @@ def test_engine_extern(): options=["-O2", "-std=c++11", "-I"+tmp_path.relpath("")]) -@pytest.mark.skip(reason="Support subgraph for json runtime later.") -def test_json_extern(): - if which("gcc") is None: - print("Skip test because gcc is not available.") - - # Get Json. - graph_json = get_whole_graph_json() - - # Get subgraph Json. 
- subgraph_json = ("gcc_0\n" + - "input 0 10 10\n" + - "input 1 10 10\n" + - "input 2 10 10\n" + - "input 3 10 10\n" + - "add 4 inputs: 0 1 shape: 10 10\n" + - "sub 5 inputs: 4 2 shape: 10 10\n" + - "mul 6 inputs: 5 3 shape: 10 10\n" + - "gcc_1\n" + - "input 0 10 10\n" + - "input 1 10 10\n" + - "input 2 10 10\n" + - "input 3 10 10\n" + - "add 4 inputs: 0 1 shape: 10 10\n" + - "sub 5 inputs: 4 2 shape: 10 10\n" + - "mul 6 inputs: 5 3 shape: 10 10") - - # load module for execution. - lib = tvm.module.load(subgraph_json, 'gcc') - mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) - - x_data = np.random.rand(10, 10).astype('float32') - mod.set_input("x", x_data) - w_data = [] - for i in range(8): - data = np.random.rand(10, 10).astype('float32') - w_data.append(data) - var = "w" + str(i) - mod.set_input(var, data) - mod.run() - out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) - out = mod.get_output(0, out) - tvm.testing.assert_allclose( - out.asnumpy(), - np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) - - if __name__ == "__main__": test_dso_extern() test_engine_extern() - # test_json_extern() From 3ce618cfb0505a0cf6f5d6bf3d7dee43baefb74c Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Tue, 19 Nov 2019 01:41:56 +0000 Subject: [PATCH 17/22] fix ci --- .../python/ext_json_rt/__init__.py | 16 + tests/python/relay/test_external_runtime.py | 497 ------------------ 2 files changed, 16 insertions(+), 497 deletions(-) delete mode 100644 tests/python/relay/test_external_runtime.py diff --git a/apps/ext_runtime/python/ext_json_rt/__init__.py b/apps/ext_runtime/python/ext_json_rt/__init__.py index e18a1a423322..8d60a663cf8a 100644 --- a/apps/ext_runtime/python/ext_json_rt/__init__.py +++ b/apps/ext_runtime/python/ext_json_rt/__init__.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. """Example extension package of TVM.""" from __future__ import absolute_import import os diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py deleted file mode 100644 index 504d5020093f..000000000000 --- a/tests/python/relay/test_external_runtime.py +++ /dev/null @@ -1,497 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Unit tests for external runtime.""" -from shutil import which -import json -import pytest -import numpy as np - -import tvm -from tvm import relay -from tvm import module as _tvm_module -from tvm.contrib import util - -tmp_path = util.tempdir() - - -def generate_csource_module(): - """Mock the codegen with an external library (e.g., CBLAS/cuDNN)""" - - code = r''' - #include - #include - #include - #include - #include - - #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ - } - - #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } - GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10); - GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10); - GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10); - - extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, - float* gcc_input6, float* gcc_input7, float* out) { - float* buf_0 = (float*)malloc(4 * 100); - float* buf_1 = (float*)malloc(4 * 100); - gcc_1_2(gcc_input4, gcc_input5, buf_0); - gcc_1_1(buf_0, gcc_input6, buf_1); - gcc_1_0(buf_1, gcc_input7, out); - free(buf_0); - free(buf_1); - } - - extern "C" int gcc_1(TVMValue* value, int* type_code, int nargs) { - if (nargs != 5) { - printf("Expect 5 args, but get %d", nargs); - return 1; - } - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* arg2 = static_cast(value[2].v_handle); - DLTensor* arg3 = static_cast(value[3].v_handle); - DLTensor* out = static_cast(value[4].v_handle); - gcc_1_(static_cast(arg0->data), static_cast(arg1->data), - static_cast(arg2->data), static_cast(arg3->data), - static_cast(out->data)); - return 0; - } - - GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10); - 
GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10); - GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10); - - extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, - float* gcc_input2, float* gcc_input3, float* out) { - float* buf_0 = (float*)malloc(4 * 100); - float* buf_1 = (float*)malloc(4 * 100); - gcc_0_2(gcc_input0, gcc_input1, buf_0); - gcc_0_1(buf_0, gcc_input2, buf_1); - gcc_0_0(buf_1, gcc_input3, out); - free(buf_0); - free(buf_1); - } - - extern "C" int gcc_0(TVMValue* value, int* type_code, int nargs) { - if (nargs != 5) { - printf("Expect 5 args, but get %d", nargs); - return 1; - } - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* arg2 = static_cast(value[2].v_handle); - DLTensor* arg3 = static_cast(value[3].v_handle); - DLTensor* out = static_cast(value[4].v_handle); - gcc_0_(static_cast(arg0->data), static_cast(arg1->data), - static_cast(arg2->data), static_cast(arg3->data), - static_cast(out->data)); - return 0; - } - ''' - csource_module = _tvm_module.csource_module_create(code, "cc") - return csource_module - - -def generate_engine_module(): - """ - Mock the codegen of an external backend with its own runtime engine - (e.g., MKL-DNN/TensorRT) - """ - - code = r''' - #include - #include - #include "gcc_engine.h" - - extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, - float* gcc_input6, float* gcc_input7, float* out) { - - std::string graph = - "add_2d,10,10\n" - "sub_2d,10,10\n" - "mul_2d,10,10\n"; - - Engine engine; - engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out); - } - - - extern "C" int gcc_1(TVMValue* value, int* type_code, int nargs) { - if (nargs != 5) { - printf("Expect 5 args, but get %d", nargs); - return 1; - } - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* arg2 = static_cast(value[2].v_handle); - DLTensor* arg3 = static_cast(value[3].v_handle); - DLTensor* out = 
static_cast(value[4].v_handle); - gcc_1_(static_cast(arg0->data), static_cast(arg1->data), - static_cast(arg2->data), static_cast(arg3->data), - static_cast(out->data)); - return 0; - } - - extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, - float* gcc_input2, float* gcc_input3, float* out) { - - std::string graph = - "add_2d,10,10\n" - "sub_2d,10,10\n" - "mul_2d,10,10\n"; - - Engine engine; - engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out); - - } - - extern "C" int gcc_0(TVMValue* value, int* type_code, int nargs) { - if (nargs != 5) { - printf("Expect 5 args, but get %d", nargs); - return 1; - } - DLTensor* arg0 = static_cast(value[0].v_handle); - DLTensor* arg1 = static_cast(value[1].v_handle); - DLTensor* arg2 = static_cast(value[2].v_handle); - DLTensor* arg3 = static_cast(value[3].v_handle); - DLTensor* out = static_cast(value[4].v_handle); - gcc_0_(static_cast(arg0->data), static_cast(arg1->data), - static_cast(arg2->data), static_cast(arg3->data), - static_cast(out->data)); - return 0; - } - ''' - - gen_gcc_engine() - csource_module = _tvm_module.csource_module_create(code, "cc") - return csource_module - - -def gen_gcc_engine(): - """An example of external backend runtime engine. This is supposed to be provided - by third-party vendors and included when building the generated external kernel code. 
- """ - - code = r''' - #ifndef _GCC_ENGINE_H_ - #define _GCC_ENGINE_H_ - #include - #include - #include - #include - - #define GCC_BINARY_OP_2D(p_ID_, p_OP_) \ - void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) { \ - for (int64_t i = 0; i < dim1; ++i) { \ - for (int64_t j = 0; j < dim2; ++j) { \ - int64_t k = i * dim2 + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } - GCC_BINARY_OP_2D(add_2d, +); - GCC_BINARY_OP_2D(sub_2d, -); - GCC_BINARY_OP_2D(mul_2d, *); - - struct Layer { - void (*op)(int64_t, int64_t, float*, float*, float*); - std::vector shapes; - std::vector args; - }; - - class Engine { - public: - float* alloc_buffer(int64_t size) { - float* buf = (float*)malloc(sizeof(float) * size); - buffers.push_back(buf); - return buf; - } - void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) { - Layer layer; - layer.shapes.push_back(dim1); - layer.shapes.push_back(dim2); - layer.args.push_back(in1); - layer.args.push_back(in2); - layer.args.push_back(out); - - if (op == "add_2d") - layer.op = &add_2d; - else if (op == "sub_2d") - layer.op = &sub_2d; - else if (op == "mul_2d") - layer.op = &mul_2d; - net.push_back(layer); - return ; - } - - void run(std::string graph, std::vector args, float* out) { - std::stringstream ss(graph); - std::string line; - int layer_idx = 0; - int arg_idx = 0; - float* buf = nullptr; - - while (std::getline(ss, line, '\n')) { - std::stringstream ss2(line); - std::string token; - std::vector attrs; - while (std::getline(ss2, token, ',')) { - attrs.push_back(token); - } - int64_t dim1 = stoll(attrs[1]); - int64_t dim2 = stoll(attrs[2]); - auto out_buf = this->alloc_buffer(dim1 * dim2); - - if (layer_idx == 0) { - this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf); - buf = out_buf; - arg_idx = 2; - } - else { - this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf); - buf = out_buf; - arg_idx++; - } - layer_idx++; - } - this->net.back().args.back() = out; - - for 
(auto layer : net) { - (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1], layer.args[2]); - } - } - ~Engine() { - for (auto buf : buffers) { - free(buf); - } - } - private: - std::vector net; - std::vector buffers; - }; - - #endif - ''' - header_file = tmp_path.relpath("gcc_engine.h") - with open(header_file, 'w') as f: - f.write(code) - - -def get_synthetic_lib(): - x = relay.var('x', shape=(10, 10)) - w0 = relay.var('w0', shape=(10, 10)) - w1 = relay.var('w1', shape=(10, 10)) - w2 = relay.var('w2', shape=(10, 10)) - w3 = relay.var('w3', shape=(10, 10)) - w4 = relay.var('w4', shape=(10, 10)) - w5 = relay.var('w5', shape=(10, 10)) - w6 = relay.var('w6', shape=(10, 10)) - w7 = relay.var('w7', shape=(10, 10)) - - # subgraph0 - gcc_input0 = relay.var('gcc_input0', shape=(10, 10)) - gcc_input1 = relay.var('gcc_input1', shape=(10, 10)) - gcc_input2 = relay.var('gcc_input2', shape=(10, 10)) - gcc_input3 = relay.var('gcc_input3', shape=(10, 10)) - subgraph0 = relay.Function([gcc_input0, gcc_input1, gcc_input2, - gcc_input3], relay.copy(gcc_input0)) - subgraph0 = subgraph0.set_attribute( - "Primitive", tvm.expr.IntImm("int32", 1)) - - # Call subgraph0 - subgraph0_ret = relay.Call(subgraph0, [x, w0, w1, w2]) - - # subgraph1 - gcc_input4 = relay.var('gcc_input4', shape=(10, 10)) - gcc_input5 = relay.var('gcc_input5', shape=(10, 10)) - gcc_input6 = relay.var('gcc_input6', shape=(10, 10)) - gcc_input7 = relay.var('gcc_input7', shape=(10, 10)) - subgraph1 = relay.Function([gcc_input4, gcc_input5, gcc_input6, - gcc_input7], relay.copy(gcc_input4)) - subgraph1 = subgraph1.set_attribute( - "Primitive", tvm.expr.IntImm("int32", 1)) - - # Call subgraph1 - subgraph1_ret = relay.Call(subgraph1, [x, w3, w4, w5]) - - # Other ops that will be executed on TVM. 
- add2 = relay.add(x, w6) - sub2 = relay.subtract(add2, w7) - ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0) - func = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], ret) - mod = relay.Module.from_expr(func) - _, lib, _ = relay.build(mod, "llvm") - return lib - - -def get_whole_graph_json(): - nodex = {"op": "null", "name": "x", "inputs": []} - node0 = {"op": "null", "name": "w0", "inputs": []} - node1 = {"op": "null", "name": "w1", "inputs": []} - node2 = {"op": "null", "name": "w2", "inputs": []} - node3 = {"op": "null", "name": "w3", "inputs": []} - node4 = {"op": "null", "name": "w4", "inputs": []} - node5 = {"op": "null", "name": "w5", "inputs": []} - node6 = {"op": "null", "name": "w6", "inputs": []} - node7 = {"op": "null", "name": "w7", "inputs": []} - - subgraph0 = { - "op": "tvm_op", - "name": "gcc_0", - "attrs": { - "num_outputs": "1", - "num_inputs": "4", - "func_name": "gcc_0", - "flatten_data": "0" - }, - "inputs": [ - [0, 0, 0], - [1, 0, 0], - [2, 0, 0], - [3, 0, 0], - ] - } - subgraph1 = { - "op": "tvm_op", - "name": "gcc_1", - "attrs": { - "num_outputs": "1", - "num_inputs": "4", - "func_name": "gcc_1", - "flatten_data": "0" - }, - "inputs": [ - [0, 0, 0], - [4, 0, 0], - [5, 0, 0], - [6, 0, 0], - ] - } - - fused_op = { - "op": "tvm_op", - "name": "fused_add_subtract_concatenate", - "attrs": { - "num_outputs": "1", - "num_inputs": "5", - "func_name": "fused_add_subtract_concatenate", - "flatten_data": "0" - }, - "inputs": [ - [9, 0, 0], - [10, 0, 0], - [0, 0, 0], - [7, 0, 0], - [8, 0, 0] - ] - } - nodes = [nodex, node0, node1, node2, node3, node4, - node5, node6, node7, subgraph0, subgraph1, fused_op] - arg_nodes = [0, 1, 2, 3, 4, 5, 6, 7, 8] - heads = [[11, 0, 0]] - node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] - storage_id = ["list_int", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]] - - shape = ["list_shape", [ - [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], [10, 10], - [10, 10], [10, 10], [10, 10], [10, 10], [10, 
10], [30, 10]]] - - dltype = ["list_str", [ - "float32", "float32", "float32", "float32", "float32", "float32", - "float32", "float32", "float32", "float32", "float32", "float32"]] - - attrs = { - "shape": shape, - "dltype": dltype, - "storage_id": storage_id, - } - - graph = {"nodes": nodes, - "arg_nodes": arg_nodes, - "node_row_ptr": node_row_ptr, - "heads": heads, - "attrs": attrs} - - return json.dumps(graph) - - -def check_extern(label, get_extern_src, **kwargs): - if which("gcc") is None: - print("Skip test because gcc is not available.") - - obj_name = "{}.o".format(label) - lib_name = "external_{}.so".format(label) - - # Get Json and the compiled library. - graph_json = get_whole_graph_json() - lib = get_synthetic_lib() - lib.save(obj_name) - - # library that contains external code. - csource_module = get_extern_src() - kwargs["options"] = [obj_name] + kwargs["options"] - lib_path = tmp_path.relpath(lib_name) - csource_module.export_library(lib_path, fcompile=False, **kwargs) - # load module for execution. 
- lib = tvm.module.load(lib_path) - mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) - - x_data = np.random.rand(10, 10).astype('float32') - mod.set_input("x", x_data) - w_data = [] - for i in range(8): - data = np.random.rand(10, 10).astype('float32') - w_data.append(data) - var = "w" + str(i) - mod.set_input(var, data) - mod.run() - out = tvm.nd.empty((30, 10), ctx=tvm.cpu()) - out = mod.get_output(0, out) - tvm.testing.assert_allclose( - out.asnumpy(), - np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) - - -def test_dso_extern(): - check_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) - - -def test_engine_extern(): - check_extern("engine", generate_engine_module, - options=["-O2", "-std=c++11", "-I"+tmp_path.relpath("")]) - - -if __name__ == "__main__": - test_dso_extern() - test_engine_extern() From e30d004c171832433148a285b30dd413e2b99147 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 19 Nov 2019 07:40:46 +0000 Subject: [PATCH 18/22] Add a simple NodeEntry --- apps/ext_runtime/src/ext_json_rt.cc | 65 ++++++++++++++++------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/apps/ext_runtime/src/ext_json_rt.cc b/apps/ext_runtime/src/ext_json_rt.cc index 721f078f83ad..8aea79b43cdd 100644 --- a/apps/ext_runtime/src/ext_json_rt.cc +++ b/apps/ext_runtime/src/ext_json_rt.cc @@ -36,11 +36,16 @@ #include #include -using namespace tvm::runtime; - namespace tvm { namespace runtime { +// A simple JSON node that contains multiple inputs and a single output. 
+struct NodeEntry { + int id; + int output; + std::vector inputs; +}; + void Add_(float* a, int len_a, float* b, int len_b, float* c) { for (int i = 0; i < len_a * len_b; i++) { c[i] = a[i] + b[i]; @@ -95,8 +100,8 @@ int Mul(TVMValue* value, int* type_code, int nargs) { class ExampleJsonModule : public ModuleNode { public: ExampleJsonModule(std::string graph_json) { - this->graph_json_ = graph_json; - ParseJson(graph_json); + this->graph_json_ = graph_json; + ParseJson(graph_json); } PackedFunc GetFunction(const std::string& name, @@ -117,9 +122,10 @@ class ExampleJsonModule : public ModuleNode { } } for (const auto& it : this->graph_[this->curr_subgraph_]) { - this->Run(it.first, it.second); + this->Run(it.id, it.inputs, it.output); } - auto out_idx = outs_[this->curr_subgraph_]; + CHECK_GT(graph_.count(this->curr_subgraph_), 0U); + auto out_idx = graph_[this->curr_subgraph_].back().output; if (args[args.size() - 1].type_code() == kArrayHandle) { DLTensor* arg = args[args.size() - 1]; this->data_entry_[out_idx].CopyTo(arg); @@ -135,23 +141,25 @@ class ExampleJsonModule : public ModuleNode { } } - void Run(int id, const std::vector& inputs) { - std::vector values(inputs.size()); - std::vector type_codes(inputs.size()); + void Run(int id, const std::vector& inputs, int output) { + std::vector args(inputs.begin(), inputs.end()); + args.push_back(output); + std::vector values(args.size()); + std::vector type_codes(args.size()); TVMArgsSetter setter(values.data(), type_codes.data()); if (op_id_[id] == "add" || op_id_[id] == "sub" || op_id_[id] == "mul") { - for (size_t i = 0; i < inputs.size(); i++) { - setter(i, data_entry_[inputs[i]]); + for (size_t i = 0; i < args.size(); i++) { + setter(i, data_entry_[args[i]]); } } if (op_id_[id] == "add") { - Add(values.data(), type_codes.data(), inputs.size()); + Add(values.data(), type_codes.data(), args.size()); } else if (op_id_[id] == "sub") { - Sub(values.data(), type_codes.data(), inputs.size()); + Sub(values.data(), 
type_codes.data(), args.size()); } else if (op_id_[id] == "mul") { - Mul(values.data(), type_codes.data(), inputs.size()); + Mul(values.data(), type_codes.data(), args.size()); } else { LOG(FATAL) << "Unknown op: " << op_id_[id] << "\n"; } @@ -179,7 +187,6 @@ class ExampleJsonModule : public ModuleNode { ss2 >> token; if (token.find("json_rt_") != std::string::npos) { curr_subgraph = token; - graph_[curr_subgraph]; continue; } @@ -200,6 +207,7 @@ class ExampleJsonModule : public ModuleNode { } else { op_id_[id] = token; bool shape_data = false; + NodeEntry entry; while (ss2 >> token) { if (token == "shape:") { shape_data = true; @@ -207,11 +215,12 @@ class ExampleJsonModule : public ModuleNode { total_elements *= std::stoll(token); shape.push_back(std::stoll(token)); } else if (token != "inputs:") { - graph_[curr_subgraph][id].push_back(std::stoi(token)); + entry.inputs.push_back(std::stoi(token)); } } - graph_[curr_subgraph][id].push_back(id); - outs_[curr_subgraph] = id; + entry.id = id; + entry.output = id; + graph_[curr_subgraph].push_back(entry); } DLContext ctx; ctx.device_type = static_cast(1); @@ -227,30 +236,28 @@ class ExampleJsonModule : public ModuleNode { } static Module LoadFromBinary(void* strm) { - dmlc::Stream* stream = static_cast(strm); - std::string graph_json; - stream->Read(&graph_json); - auto n = tvm::runtime::make_object(graph_json); - return Module(n); + dmlc::Stream* stream = static_cast(strm); + std::string graph_json; + stream->Read(&graph_json); + auto n = tvm::runtime::make_object(graph_json); + return Module(n); } private: std::string graph_json_; std::string curr_subgraph_; - // subgraph_id -> op -> inputs - std::map>> graph_; - // subgraph_id -> out - std::map outs_; + // A simple graph. 
+ std::map > graph_; std::vector data_entry_; // id -> op std::vector op_id_; }; TVM_REGISTER_GLOBAL("ext_json_rt.create_json_rt") - .set_body_typed(ExampleJsonModule::LoadFromFile); +.set_body_typed(ExampleJsonModule::LoadFromFile); TVM_REGISTER_GLOBAL("module.loadbinary_examplejson") - .set_body_typed(ExampleJsonModule::LoadFromBinary); +.set_body_typed(ExampleJsonModule::LoadFromBinary); } // namespace runtime } // namespace tvm From a3c5a2f09aff4800d05de961940fdec2b60cd130 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 19 Nov 2019 19:18:35 +0000 Subject: [PATCH 19/22] retrigger ci From 3793bd711b756d543f46438bf100a3110444c0ae Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 21 Nov 2019 18:50:21 +0000 Subject: [PATCH 20/22] add comment --- apps/ext_runtime/src/ext_json_rt.cc | 60 +++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/apps/ext_runtime/src/ext_json_rt.cc b/apps/ext_runtime/src/ext_json_rt.cc index 8aea79b43cdd..f2e78f443b3a 100644 --- a/apps/ext_runtime/src/ext_json_rt.cc +++ b/apps/ext_runtime/src/ext_json_rt.cc @@ -20,6 +20,22 @@ /*! * \file external_runtime_test.cc * \brief Test an example runtime module to interpreting a json string. + * + * This is an exmaple runtime employed to show how we can interprete and execute + * a json string that represents a simple computational (sub)graph. Users will + * mainly need to implement four functions as follows: + * - GetFunction. It is used to get the packed function from the json runtime + * module using a provided function name. This function returns a PackedFunc + * that can be directly invoked by feeding it with parameters. + * - SaveToBinary. This function is used to achieve the serialization purpose. + * The emitted binary stream can be directly saved to disk so that users can + * load then back when needed. + * - LoadFromFile. This is a static function that acts as a helper to create + * a json runtime module using a given json string. 
The json string could be + * conveniently loaded from the front-end and passed to this interface through + * PackedFunc. + * - LoadFromBinary. This function uses binary stream to load the json that + * saved by SaveToBinary which essentially performs deserialization. */ #include #include @@ -104,6 +120,14 @@ class ExampleJsonModule : public ModuleNode { ParseJson(graph_json); } + /*! + * \brief Get a PackedFunc from the example json module. + * + * \param name the name of the function. + * \param sptr_to_self The ObjectPtr that points to this module node. + * + * \return The function pointer when it is found, otherwise, PackedFunc(nullptr). + */ PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { if (this->graph_.find(name) != this->graph_.end()) { @@ -141,6 +165,15 @@ class ExampleJsonModule : public ModuleNode { } } + /*! + * \brief Execute a function with provided arguments. The output will be + * packed to the last argument according to TVM's calling convention. + * + * \param id The id of the function. + * \param inputs The input indices that indicate where the data should be + * fetched in the data entry pool. + * \param output The output index. + */ void Run(int id, const std::vector& inputs, int output) { std::vector args(inputs.begin(), inputs.end()); args.push_back(output); @@ -171,9 +204,15 @@ class ExampleJsonModule : public ModuleNode { stream->Write(this->graph_json_); } - // Note this is a very simple json that only serves for demostration purpose. - // Users usually have their own format and they can serialize it using the - // SaveToBinary method and deserialize it using LoadFromFile. + /*! + * \brief Parse the example json string. + * + * \param json. The json string that represents a simple computational graph. + * + * \Note this is a very simple json that only serves for demostration purpose. 
+ * Users usually have their own format and they can serialize it using the + * SaveToBinary method and deserialize it using LoadFromFile. + */ void ParseJson(const std::string& json) { std::string line; std::string curr_subgraph; @@ -229,12 +268,27 @@ class ExampleJsonModule : public ModuleNode { } } + /*! + * \brief Load a json module from a json string. + * + * \param json The json string that represents a computational graph. + * \param format The format of the file which is not used here. + * + * \return The created json module. + */ static Module LoadFromFile(const std::string& json, const std::string& format) { auto n = tvm::runtime::make_object(json); return Module(n); } + /*! + * \brief Load a json module from stream. + * + * \param strm The binary stream to load json. + * + * \return The created json module. + */ static Module LoadFromBinary(void* strm) { dmlc::Stream* stream = static_cast(strm); std::string graph_json; From 65e3510bc88f6dc6772e8e1642a180fba8213385 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 21 Nov 2019 20:39:25 +0000 Subject: [PATCH 21/22] Move back to contrib --- CMakeLists.txt | 6 ++++ Jenkinsfile | 4 +++ apps/README.md | 1 - apps/ext_runtime/Makefile | 34 ------------------ apps/ext_runtime/README.md | 26 -------------- .../python/ext_json_rt/__init__.py | 35 ------------------- cmake/config.cmake | 3 ++ .../example_ext_runtime.cc | 28 ++++++++++++--- .../python/relay/test_external_runtime.py | 21 +++++------ 9 files changed, 48 insertions(+), 110 deletions(-) delete mode 100644 apps/ext_runtime/Makefile delete mode 100644 apps/ext_runtime/README.md delete mode 100644 apps/ext_runtime/python/ext_json_rt/__init__.py rename apps/ext_runtime/src/ext_json_rt.cc => src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc (91%) rename apps/ext_runtime/tests/test_rt.py => tests/python/relay/test_external_runtime.py (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bea818b7581..ece951387632 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -231,6 +231,12 @@ if(USE_VM_PROFILER) list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS}) endif(USE_VM_PROFILER) +if(USE_EXAMPLE_EXT_RUNTIME) + message(STATUS "Build with Relay VM profiler support...") + file(GLOB RUNTIME_EXAMPLE_EXTERNAL_SRCS src/runtime/contrib/exmaple_ext_runtime/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_EXAMPLE_EXTERNAL_SRCS}) +endif(USE_EXAMPLE_EXT_RUNTIME) + # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/CUDA.cmake) diff --git a/Jenkinsfile b/Jenkinsfile index 5426cb5ab500..736a67489e62 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -155,6 +155,7 @@ stage('Build') { echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake + echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_ANTLR ON\\) >> config.cmake echo set\\(USE_BLAS openblas\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake @@ -173,6 +174,7 @@ stage('Build') { echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake + echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER clang-7\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake """ @@ -193,6 +195,7 @@ stage('Build') { echo set\\(USE_MICRO_STANDALONE_RUNTIME ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake + echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-8\\) >> config.cmake echo set\\(USE_NNPACK ON\\) >> config.cmake echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake @@ -225,6 +228,7 @@ stage('Build') { echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_MICRO_STANDALONE_RUNTIME ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake + echo 
set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake diff --git a/apps/README.md b/apps/README.md index 58c9447f9bd8..685750633493 100644 --- a/apps/README.md +++ b/apps/README.md @@ -26,4 +26,3 @@ If you are interested in writing optimized kernels with TVM, checkout [TOPI: TVM - [android_rpc](android_rpc) Android RPC server. - [benchmark](benchmark) Example end to end compilation benchmarks - [howto_deploy](howto_deploy) Tutorial on how to deploy TVM with minimum code dependency. -- [ext_runtime](ext_runtime) How to extend TVM runtime for external backends. diff --git a/apps/ext_runtime/Makefile b/apps/ext_runtime/Makefile deleted file mode 100644 index ec50477a7247..000000000000 --- a/apps/ext_runtime/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Minimum Makefile for the extension package -TVM_ROOT=$(shell cd ../..; pwd) -PKG_CFLAGS = -std=c++11 -O2 -fPIC\ - -I${TVM_ROOT}/include\ - -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include\ - -PKG_LDFLAGS =-L${TVM_ROOT}/build -UNAME_S := $(shell uname -s) - -ifeq ($(UNAME_S), Darwin) - PKG_LDFLAGS += -undefined dynamic_lookup -endif - -lib/libtvm_ext_json_rt.so: src/ext_json_rt.cc - @mkdir -p $(@D) - $(CXX) $(PKG_CFLAGS) -shared -o $@ $^ $(PKG_LDFLAGS) diff --git a/apps/ext_runtime/README.md b/apps/ext_runtime/README.md deleted file mode 100644 index 142309fc42e9..000000000000 --- a/apps/ext_runtime/README.md +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - -Example External Runtime -======================== -This folder contains external runtime examples of TVM. - -- Extend DSO module as an external runtime. -- Extend TVM runtime module as a customized JSON runtime. -- The python module that creates new external runtime and intergrates into TVM exported library. - diff --git a/apps/ext_runtime/python/ext_json_rt/__init__.py b/apps/ext_runtime/python/ext_json_rt/__init__.py deleted file mode 100644 index 8d60a663cf8a..000000000000 --- a/apps/ext_runtime/python/ext_json_rt/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -"""Example extension package of TVM.""" -from __future__ import absolute_import -import os -import ctypes -# Import TVM first to get library symbols -import tvm - -def load_lib(): - """Load library, the functions will be registered into TVM""" - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - # load in as global so the global extern symbol is visible to other dll. - lib = ctypes.CDLL( - os.path.join(curr_path, "../../lib/libtvm_ext_json_rt.so"), ctypes.RTLD_GLOBAL) - return lib - -_LIB = load_lib() - -create_json_rt = tvm.get_global_func("ext_json_rt.create_json_rt") - diff --git a/cmake/config.cmake b/cmake/config.cmake index 51c929233aa6..27907b85d6fc 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -161,3 +161,6 @@ set(USE_VTA_TSIM ON) # Whether to build VTA FPGA driver (device side only) set(USE_VTA_FPGA OFF) + +# Whether to build the example external runtime module +set(USE_EXAMPLE_EXT_RUNTIME OFF) diff --git a/apps/ext_runtime/src/ext_json_rt.cc b/src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc similarity index 91% rename from apps/ext_runtime/src/ext_json_rt.cc rename to src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc index f2e78f443b3a..59cf4dd078ff 100644 --- a/apps/ext_runtime/src/ext_json_rt.cc +++ b/src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc @@ -62,6 +62,12 @@ struct NodeEntry { std::vector inputs; }; +/*! + * \brief The following 6 functions are examples for demonstration. Users need + * to provide their own API when they use the external library. The ones that + * accept TVMValue are wrappers used to bridge the PackedFunc and user-defined kernels. 
+ */ void Add_(float* a, int len_a, float* b, int len_b, float* c) { for (int i = 0; i < len_a * len_b; i++) { c[i] = a[i] + b[i]; @@ -113,9 +119,14 @@ int Mul(TVMValue* value, int* type_code, int nargs) { return 0; } +/*! + * \brief The example json runtime module. Here we define a simple format for + * the computational graph using json for demonstration purpose. Users should + * customize their own format. + */ class ExampleJsonModule : public ModuleNode { public: - ExampleJsonModule(std::string graph_json) { + explicit ExampleJsonModule(std::string graph_json) { this->graph_json_ = graph_json; ParseJson(graph_json); } @@ -200,6 +211,12 @@ class ExampleJsonModule : public ModuleNode { const char* type_key() const { return "examplejson"; } + /*! + * \brief Save the json runtime to a binary stream, which can then be + * serialized to disk. + * + * \param stream. The stream to save the binary. + */ void SaveToBinary(dmlc::Stream* stream) final { stream->Write(this->graph_json_); } @@ -298,16 +315,19 @@ class ExampleJsonModule : public ModuleNode { } private: + /* \brief The json string that represents a computational graph. */ std::string graph_json_; + /* \brief The subgraph that being processed. */ std::string curr_subgraph_; - // A simple graph. + /*! \brief A simple graph from subgraph id to node entries. */ std::map > graph_; + /* \brief A simple pool to contain the tensor for each node in the graph. */ std::vector data_entry_; - // id -> op + /* \brief A mapping from node id to op name. 
*/ std::vector op_id_; }; -TVM_REGISTER_GLOBAL("ext_json_rt.create_json_rt") +TVM_REGISTER_GLOBAL("module.loadfile_examplejson") .set_body_typed(ExampleJsonModule::LoadFromFile); TVM_REGISTER_GLOBAL("module.loadbinary_examplejson") diff --git a/apps/ext_runtime/tests/test_rt.py b/tests/python/relay/test_external_runtime.py similarity index 98% rename from apps/ext_runtime/tests/test_rt.py rename to tests/python/relay/test_external_runtime.py index 78c565495ffa..4abc4fffc366 100644 --- a/apps/ext_runtime/tests/test_rt.py +++ b/tests/python/relay/test_external_runtime.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import ext_json_rt from shutil import which import json import pytest @@ -483,16 +482,16 @@ def run_extern(label, get_extern_src, **kwargs): axis=0)) -def tutorial_dso_extern(): +def test_dso_extern(): run_extern("lib", generate_csource_module, options=["-O2", "-std=c++11"]) -def tutorial_engine_extern(): +def test_engine_extern(): run_extern("engine", generate_engine_module, options=["-O2", "-std=c++11", "-I" + tmp_path.relpath("")]) -def tutorial_json_extern(): +def test_json_extern(): if which("gcc") is None: print("Skip test because gcc is not available.") @@ -517,12 +516,14 @@ def tutorial_json_extern(): # Get Json and module. graph_json = get_whole_graph_json() lib = get_synthetic_lib() - ext_lib = ext_json_rt.create_json_rt(subgraph_json, "") + ext_lib = tvm.module.load(subgraph_json, "examplejson") lib.import_module(ext_lib) - lib.export_library('external.so') + lib_name = 'external.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path) # load module for execution. 
- lib = tvm.module.load('external.so') + lib = tvm.module.load(lib_path) mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0)) x_data = np.random.rand(10, 10).astype('float32') @@ -546,6 +547,6 @@ def tutorial_json_extern(): if __name__ == "__main__": - tutorial_dso_extern() - tutorial_engine_extern() - tutorial_json_extern() \ No newline at end of file + test_dso_extern() + test_engine_extern() + test_json_extern() From 2484b50c8d2a1b3b0b56b5964aff6fc72f9e7243 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Thu, 21 Nov 2019 22:10:19 +0000 Subject: [PATCH 22/22] Fix comments --- CMakeLists.txt | 4 +-- Jenkinsfile | 4 --- .../example_ext_runtime.cc | 30 +++++++++++-------- tests/python/relay/test_external_runtime.py | 8 ++++- 4 files changed, 27 insertions(+), 19 deletions(-) rename src/runtime/contrib/{exmaple_ext_runtime => example_ext_runtime}/example_ext_runtime.cc (94%) diff --git a/CMakeLists.txt b/CMakeLists.txt index ece951387632..6c0fd97d0496 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,8 +232,8 @@ if(USE_VM_PROFILER) endif(USE_VM_PROFILER) if(USE_EXAMPLE_EXT_RUNTIME) - message(STATUS "Build with Relay VM profiler support...") - file(GLOB RUNTIME_EXAMPLE_EXTERNAL_SRCS src/runtime/contrib/exmaple_ext_runtime/*.cc) + message(STATUS "Build with example external runtime...") + file(GLOB RUNTIME_EXAMPLE_EXTERNAL_SRCS src/runtime/contrib/example_ext_runtime/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_EXAMPLE_EXTERNAL_SRCS}) endif(USE_EXAMPLE_EXT_RUNTIME) diff --git a/Jenkinsfile b/Jenkinsfile index 736a67489e62..5426cb5ab500 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -155,7 +155,6 @@ stage('Build') { echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake - echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_ANTLR ON\\) >> config.cmake echo set\\(USE_BLAS openblas\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER 
g++\\) >> config.cmake @@ -174,7 +173,6 @@ stage('Build') { echo set\\(USE_MICRO ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake - echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER clang-7\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake """ @@ -195,7 +193,6 @@ stage('Build') { echo set\\(USE_MICRO_STANDALONE_RUNTIME ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake - echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-8\\) >> config.cmake echo set\\(USE_NNPACK ON\\) >> config.cmake echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake @@ -228,7 +225,6 @@ stage('Build') { echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake echo set\\(USE_MICRO_STANDALONE_RUNTIME ON\\) >> config.cmake echo set\\(USE_VM_PROFILER ON\\) >> config.cmake - echo set\\(USE_EXAMPLE_EXT_RUNTIME ON\\) >> config.cmake echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake diff --git a/src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc b/src/runtime/contrib/example_ext_runtime/example_ext_runtime.cc similarity index 94% rename from src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc rename to src/runtime/contrib/example_ext_runtime/example_ext_runtime.cc index 59cf4dd078ff..ef6fc870f7f0 100644 --- a/src/runtime/contrib/exmaple_ext_runtime/example_ext_runtime.cc +++ b/src/runtime/contrib/example_ext_runtime/example_ext_runtime.cc @@ -30,10 +30,6 @@ * - SaveToBinary. This function is used to achieve the serialization purpose. * The emitted binary stream can be directly saved to disk so that users can * load then back when needed. - * - LoadFromFile. 
This is a static function that acts as a helper to create - * a json runtime module using a given json string. The json string could be - * conveniently loaded from the front-end and passed to this interface through - * PackedFunc. * - LoadFromBinary. This function uses binary stream to load the json that * saved by SaveToBinary which essentially performs deserialization. */ @@ -46,6 +42,7 @@ #include #include +#include #include #include #include @@ -128,7 +125,7 @@ class ExampleJsonModule : public ModuleNode { public: explicit ExampleJsonModule(std::string graph_json) { this->graph_json_ = graph_json; - ParseJson(graph_json); + ParseJson(this->graph_json_); } /*! @@ -286,16 +283,23 @@ class ExampleJsonModule : public ModuleNode { } /*! - * \brief Load a json module from a json string. + * \brief Create a module from a file path of a serialized graph. * - * \param json The json string that represents a computational graph. - * \param format The format of the file which is not used here. + * \param path The file path contains a computational graph representation. * * \return The created json module. 
*/ - static Module LoadFromFile(const std::string& json, - const std::string& format) { - auto n = tvm::runtime::make_object(json); + static Module Create(const std::string& path) { + std::ifstream filep; + filep.open(path, std::ios::in); + std::string graph_json; + std::string line; + while (std::getline(filep, line)) { + graph_json += line; + graph_json += "\n"; + } + filep.close(); + auto n = tvm::runtime::make_object(graph_json); return Module(n); } @@ -328,7 +332,9 @@ class ExampleJsonModule : public ModuleNode { }; TVM_REGISTER_GLOBAL("module.loadfile_examplejson") -.set_body_typed(ExampleJsonModule::LoadFromFile); +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = ExampleJsonModule::Create(args[0]); +}); TVM_REGISTER_GLOBAL("module.loadbinary_examplejson") .set_body_typed(ExampleJsonModule::LoadFromBinary); diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 4abc4fffc366..887d9dc33871 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -513,10 +513,16 @@ def test_json_extern(): "sub 5 inputs: 4 2 shape: 10 10\n" + "mul 6 inputs: 5 3 shape: 10 10") + subgraph_path = tmp_path.relpath('subgraph.examplejson') + with open(subgraph_path, 'w') as f: + f.write(subgraph_json) + # Get Json and module. graph_json = get_whole_graph_json() + + lib = get_synthetic_lib() - ext_lib = tvm.module.load(subgraph_json, "examplejson") + ext_lib = tvm.module.load(subgraph_path, "examplejson") lib.import_module(ext_lib) lib_name = 'external.so' lib_path = tmp_path.relpath(lib_name)