From 487cee02c6cb9fec41cf454d633d4968145ff4f5 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 13 Aug 2019 06:31:44 +0000 Subject: [PATCH 01/34] graph partitioning --- include/tvm/relay/attrs/annotation.h | 13 ++ include/tvm/relay/transform.h | 8 + python/tvm/relay/op/annotation/annotation.py | 40 ++++ python/tvm/relay/transform.py | 12 + src/relay/backend/compile_engine.cc | 28 ++- src/relay/backend/compile_engine.h | 2 + src/relay/backend/test_external_codegen.cc | 165 ++++++++++++++ src/relay/backend/test_external_library.cc | 54 +++++ src/relay/backend/test_external_library.h | 26 +++ src/relay/op/annotation/annotation.cc | 46 ++++ src/relay/pass/partition_graph.cc | 214 ++++++++++++++++++ .../python/relay/test_pass_partition_graph.py | 63 ++++++ 12 files changed, 669 insertions(+), 2 deletions(-) create mode 100644 src/relay/backend/test_external_codegen.cc create mode 100644 src/relay/backend/test_external_library.cc create mode 100644 src/relay/backend/test_external_library.h create mode 100644 src/relay/pass/partition_graph.cc create mode 100644 tests/python/relay/test_pass_partition_graph.py diff --git a/include/tvm/relay/attrs/annotation.h b/include/tvm/relay/attrs/annotation.h index fd21db5a9c14..cc7803ecde6f 100644 --- a/include/tvm/relay/attrs/annotation.h +++ b/include/tvm/relay/attrs/annotation.h @@ -57,6 +57,19 @@ struct CastHintAttrs : public tvm::AttrsNode { } }; +/*! + * \brief Options for the subgraph operators. + */ +struct SubgraphAttrs : public tvm::AttrsNode { + /*! \brief The 3rd party compiler for subgraph code generation. */ + std::string compiler; + + TVM_DECLARE_ATTRS(SubgraphAttrs, "relay.attrs.SubgraphAttrs") { + TVM_ATTR_FIELD(compiler) + .describe("The 3rd compiler used for subgraph code generation."); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_ANNOTATION_H_ diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index ddadbe4fc31d..92eb99f2cd94 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -576,6 +576,14 @@ TVM_DLL Pass EtaExpand(bool expand_constructor, bool expand_global_var); */ TVM_DLL Pass PrintIR(bool show_meta_data = true); +/*! + * \brief Partition a Relay program into regions that can be executed on + * different backends. + * + * \return The pass. + */ +TVM_DLL Pass PartitionGraph(); + } // namespace transform /*! diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 2b9d4bcd81bc..835a04c5bec9 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -62,6 +62,7 @@ def stop_fusion(data): """ return _make.stop_fusion(data) + def checkpoint(data): """Annotate an expression to be a checkpoint for the checkpointing memory optimization. @@ -78,3 +79,42 @@ def checkpoint(data): return _make.checkpoint(data) register_schedule("annotation.checkpoint", schedule_injective) + + +def subgraph_begin(data, compiler): + """Annotate an expression to indicate that it is the beginning of + a subgraph. + + Parameters + ---------- + data : tvm.relay.Expr + The expression to be annotated. + + compiler : Str + The compiler used to generate code of a subgraph. + + Returns + ------- + result : tvm.relay.Expr + The annotated expression. + """ + return _make.subgraph_begin(data, compiler) + + +def subgraph_end(data, compiler): + """Annotate an expression to indicate that it is the end of a subgraph. 
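+    This annotation is paired with subgraph_begin: the expressions between the
+    two markers are grouped into a subgraph that the partitioning pass offloads
+    to the given external compiler.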
+ + Parameters + ---------- + data : tvm.relay.Expr + The expression to be annotated. + + compiler : Str + The compiler used to generate code of a subgraph. + + Returns + ------- + result : tvm.relay.Expr + The annotated expression. + """ + return _make.subgraph_end(data, compiler) diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index 540c1f5b79cd..beb75f528c64 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -635,6 +635,18 @@ def PrintIR(show_meta_data=True): return _transform.PrintIR(show_meta_data) +def PartitionGraph(): + """Partition a Relay program into regions that can be executed on different + backends. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that partitions the Relay program. + """ + return _transform.PartitionGraph() + + def gradient(expr, mod=None, mode='higher_order'): """ Transform the input function, diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 083fa5d5610c..534d96b814ae 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include #include @@ -594,8 +595,14 @@ class CompileEngineImpl : public CompileEngineNode { PackedFunc JIT(const CCacheKey& key) final { CCacheValue value = LowerInternal(key); if (value->packed_func != nullptr) return value->packed_func; - // build the function. - if (const auto* f = runtime::Registry::Get("relay.backend.build")) { + // Handle 3rd party generated code library. + if (value->lib.operator->()) { + auto name = FunctionGetAttr(key->source_func, "func_name"); + const tvm::ir::StringImm* func_name = name.as(); + CHECK(func_name); + value->packed_func = value->lib.GetFunction(func_name->value); + } else if (const auto* f = runtime::Registry::Get("relay.backend.build")) { + // build the function. tvm::runtime::Module m = (*f)(value->cached_func->funcs, key->target); value->packed_func = m.GetFunction(value->cached_func->func_name); } else { @@ -648,6 +655,23 @@ class CompileEngineImpl : public CompileEngineNode { value->use_count = 0; cache_[key] = value; } + + auto compiler = FunctionGetAttr(key->source_func, "External"); + if (compiler.defined()) { + const tvm::ir::StringImm* code_gen = compiler.as(); + CHECK(code_gen); + std::string ext_name = "relay.ext." + code_gen->value; + auto pf = tvm::runtime::Registry::Get(ext_name); + CHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; + + // Invoke the 3rd party codegen to generate a library for the subgraph. + runtime::Module mod = (*pf)(key->source_func); + value->lib = mod; + value->cached_func = CachedFunc(); + // value->packed_func = (*pf)(key->source_func);; + return value; + } + // Enforce use the target. With target_scope(key->target); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 31e246ecf1fe..2849ca7fe7ad 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -137,6 +137,8 @@ class CCacheValueNode : public Node { CachedFunc cached_func; /*! \brief Result of Packed function generated by JIT */ PackedFunc packed_func; + /*! \brief An external library generated by the 3rd party codegen. */ + runtime::Module lib; /*! 
\brief usage statistics */ int use_count{0}; diff --git a/src/relay/backend/test_external_codegen.cc b/src/relay/backend/test_external_codegen.cc new file mode 100644 index 000000000000..232d4c61b45b --- /dev/null +++ b/src/relay/backend/test_external_codegen.cc @@ -0,0 +1,165 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "test_external_library.h" + +namespace tvm { +namespace relay { + +typedef void (*sub)(ExternalTensor a, ExternalTensor b, ExternalTensor* out); + +class ExternalModuleNode : public runtime:: ModuleNode { + public: + ExternalModuleNode() = default; + ~ExternalModuleNode() { + if (handle_ != nullptr) { + dlclose(handle_); + } + } + + // void Init(const std::string& bin_path); + // void Exec(const std::string& fun_name, const TVMArgs& args); + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. + * + * \param name the name of the external function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. + */ + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) override { + if (name == "Subtract") { + CHECK(handle_) << "You need to build the external module first"; + func_s_ = reinterpret_cast(dlsym(handle_,"Subtract")); + char* error = dlerror(); + if (error != NULL) { + LOG(FATAL) << error; + return PackedFunc(); + } + + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + runtime::NDArray a = args[0]; + ExternalTensor lhs; + lhs.data = a->data; + lhs.ndim = a.Shape().size(); + // lhs.shape = a.Shape().data(); + lhs.shape = new int64_t[lhs.ndim]; + + runtime::NDArray b = args[1]; + ExternalTensor rhs; + rhs.data = b->data; + rhs.ndim = b.Shape().size(); + rhs.shape = new int64_t[rhs.ndim]; + // rhs.shape = b.Shape().data(); + + runtime::NDArray c = args[2]; + ExternalTensor out; + out.data = c->data; + out.ndim = c.Shape().size(); + out.shape = c.Shape().data(); + + for (int i = 0; i < lhs.ndim; i++) { + lhs.shape[i] = a.Shape()[i]; + rhs.shape[i] = b.Shape()[i]; + } + (*func_s_)(lhs, rhs, &out); + *rv = c; + }); + } else { + LOG(FATAL) << "Unknow function found when invoking extern library: " << name; + return PackedFunc(); + } + } + + /*! + * \brief Get the source code of the external module. + * + * \param format The format of the source code. + * + * \return The source code of the external library module in the text form. 
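+   *
+   * Note: this test module is backed by a prebuilt shared library, so there is
+   * no source to report and the implementation simply returns an empty string.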
+ */ + TVM_DLL std::string GetSource(const std::string& format = "") override { + return ""; + } + + const char* type_key() const final { + return "ExternalModule"; + } + + void Build() { + std::system( + "g++ -std=c++11 -shared -fPIC -ldl src/relay/backend/test_external_library.cc -o /tmp/subtract.so"); + handle_ = dlopen("/tmp/subtract.so", RTLD_LAZY); + if (!handle_) { + LOG(FATAL) << "Cannot open library: " << dlerror() << '\n'; + } + } + + private: + void* handle_{nullptr}; + sub func_s_; +}; + +runtime::Module CreateExternalModule() { + std::shared_ptr n = std::make_shared(); + n->Build(); + return runtime::Module(n); +} + +} // namespace relay +} // namespace tvm + +namespace tvm { +namespace relay { + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression and + * compile it into a runtime module. + * + * The external codegen tool should have been registered similiarly to LLVM, + * CUDA, etc, under TVM so the generated code could be packed in a runtime + * module. This module simplifies code serialization and invocation. + * + * TODO(@zhiics) + * 1. Let the external compiler ingest a Relay module instead of + * a single expression/function. + * 2. Return runtime::Module. + */ +runtime::Module Compiler(const Expr& expr) { + Function func = Downcast(expr); + CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + return CreateExternalModule(); +} + +TVM_REGISTER_API("relay.ext.gcc") +.set_body_typed(Compiler); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/test_external_library.cc b/src/relay/backend/test_external_library.cc new file mode 100644 index 000000000000..b3a47d59d63c --- /dev/null +++ b/src/relay/backend/test_external_library.cc @@ -0,0 +1,54 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "test_external_library.h" + +#include +#include + +extern "C" void Subtract(ExternalTensor a, ExternalTensor b, ExternalTensor* out) { + if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { + std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim + << ", b.ndim = " << b.ndim + << ", out ndim = " << out->ndim << std::endl; + } + for (int i = 0; i < a.ndim; i++) { + if (a.shape[i] != b.shape[i]) { + std::cerr << "shape[" << i << "]: a = " << a.shape[i] << ", b = " << b.shape[i] << std::endl; + } + } + std::cout << "dim: " << a.ndim << " shape: " << std::endl; + for (int i = 0; i < a.ndim; i++) { + std::cout << a.shape[i] << " " << b.shape[i] << std::endl; + } + float* a_ptr = static_cast(a.data); + float* b_ptr = static_cast(b.data); + float* out_ptr = static_cast(out->data); + if (a.ndim == 1) { + for (int64_t i = 0; i < a.shape[0]; i++) { + out_ptr[i] = a_ptr[i] - b_ptr[i]; + } + } else { + for (int64_t i = 0; i < a.shape[0]; i++) { + for (int64_t j = 0; j < a.shape[1]; j++) { + int64_t k = i * a.shape[1] + j; + out_ptr[k] = a_ptr[k] - b_ptr[k]; + } + } + } +} diff --git a/src/relay/backend/test_external_library.h b/src/relay/backend/test_external_library.h new file mode 100644 index 000000000000..0467567d9795 --- /dev/null +++ b/src/relay/backend/test_external_library.h @@ -0,0 +1,26 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +typedef struct { + void* data; + int ndim; + int64_t* shape; +} ExternalTensor; diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index f5674fa06adb..76525071006b 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -171,5 +171,51 @@ Mark a checkpoint for checkpointing memory optimization. 
return outputs; }); +RELAY_REGISTER_OP("annotation.subgraph_begin") +.describe(R"code(Begin region of a subgraph.)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + +TVM_REGISTER_API("relay.op.annotation._make.subgraph_begin") +.set_body_typed([](Expr expr, std::string compiler) { + auto attrs = make_node(); + attrs->compiler = compiler; + static const Op& op = Op::Get("annotation.subgraph_begin"); + return CallNode::make(op, {expr}, Attrs(attrs), {}); +}); + +RELAY_REGISTER_OP("annotation.subgraph_end") +.describe(R"code(End region of a subgraph.)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_support_level(10) +.add_type_rel("Identity", IdentityRel) +.set_attr("TOpPattern", kOpaque) +.set_attr("TOpIsStateful", false) +.set_attr("FInferCorrectLayout", + ElemwiseArbitraryLayout) +.set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype, const Target& target) -> Array { + return {topi::identity(inputs[0])}; + }); + +TVM_REGISTER_API("relay.op.annotation._make.subgraph_end") +.set_body_typed([](Expr expr, std::string compiler) { + auto attrs = make_node(); + attrs->compiler = compiler; + static const Op& op = Op::Get("annotation.subgraph_end"); + return CallNode::make(op, {expr}, Attrs(attrs), {}); +}); + } // namespace relay } // namespace tvm diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc new file mode 100644 index 000000000000..83256db59e0d --- /dev/null +++ b/src/relay/pass/partition_graph.cc @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! Copyright (c) 2019 by Contributorsr + * \file src/relay/pass/partition_graph.cc + * + * \brief Partition an input function into multiple Functions according based + * on the inserted annotation nodes (i.e. begin and end). These nodes are used + * as boundaries to partition the Relay function into multiple regions that can + * be offloaded to different accelerators. + * + * Each of these paritioned functions, a.k.a subgraphs, will be viewed as + * external functions, and they will use external tools for codegen. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace graph_partitioning { + +/*! + * \brief The checker that verifies if a Relay program is annotated correctly + * for graph partitioning. 
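+ *
+ * The initial implementation below is a placeholder that always returns true;
+ * a later patch extends it to verify that both begin and end annotations exist.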
+ */ +class AnnotationChecker : public ExprVisitor { + public: + bool Check(const Expr& expr) { + return true; + } +}; + +/*! \brief This class partitions the graph labeled with begin and end annoations + * into function containing multiple subgraphs. Each subgraph is labeled as + * external. + * + * TODO(@zhiics) This following algorithm is not adequate to handle all cases, + * i.e. multiple `end` nodes. + */ +class Partitioner : public ExprMutator { + public: + Expr VisitExpr_(const CallNode* call) final { + auto op_node = call->op.as(); + + // Use the default visitor to traverse the nodes that are not subgraph + // nodes. + if (op_node == nullptr || call->attrs.as() == nullptr) { + return ExprMutator::VisitExpr_(call); + } + + if (GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { + auto input_expr = VisitExpr(call->args[0]); + auto subgraph_attrs = call->attrs.as(); + auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_count_++), + call->args[0]->checked_type()); + subgraph_args_.push_back({var, input_expr}); + return std::move(var); + } else { + CHECK(GetRef(op_node) == Op::Get("annotation.subgraph_end")); + CHECK(!found_end_) << "Nested subgraph is not allowed." << "\n"; + found_end_ = true; + + auto subgraph_attrs = call->attrs.as(); + CHECK(subgraph_attrs); + auto input = VisitExpr(call->args[0]); + Array params; + Array args; + + for (auto pair : subgraph_args_) { + params.push_back(pair.first); + args.push_back(pair.second); + } + + auto subgraph_func = FunctionNode::make(params, input, Type(), {}, Attrs()); + + subgraph_func = + FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make("Subtract")); + subgraph_func = FunctionSetAttr(subgraph_func, "Primitive", tvm::Integer(1)); + subgraph_func = FunctionSetAttr(subgraph_func, "External", + tvm::ir::StringImm::make(subgraph_attrs->compiler)); + subgraph_args_.clear(); + var_count_ = 0; + found_end_ = false; + return CallNode::make(subgraph_func, args); + } + } + + /* + * \brief For cases like the following: + * + * op1 + * | + * end + * | + * op2 + * / \ + * x y + * + * where x and y could be inputs, e.g. vars and/or constants. Here, we should + * group all nodes/expressions that are dominated by op2 in the same subgraph. + */ + Expr VisitExpr_(const VarNode* vn) final { + Expr var = GetRef(vn); + return var; + } + + Expr VisitExpr_(const ConstantNode* cn) final { + Expr constant = GetRef(cn); + return constant; + } + + private: + int var_count_{0}; + bool found_end_{false}; + std::vector > subgraph_args_; +}; + +/*! + * \brief Combine parallel subgraphs that belong to the same codegen backend. + * + * For example, sg1 and sg2 should be combined if they belong to the same + * codegen tool in the following case. + * + * op1 + * / \ + * sg1 sg2 + * + * | + * \|/ + * + * op1 + * | + * sg1_sg2 + * + * where the return type of the new subgraph sg1_sg2 is a tuple, and op1 has two + * inputs that obtained from the tuple. 
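+ *
+ * Note that the combination itself is not implemented yet: Combine only
+ * collects the parallel groups via GroupFinder and returns the expression
+ * unchanged.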
+ */ +class ParallelSubgraphCombiner : public ExprMutator { + using ParallelGroup = std::vector>; + + public: + Expr Combine(const Expr& expr) { + ParallelGroup groups = GroupFinder().FindGroups(expr); + return expr; + } + + private: + class GroupFinder : public ExprVisitor { + public: + ParallelGroup FindGroups(const Expr& expr) { + this->VisitExpr(expr); + return groups_; + } + + void VisitExpr_(const CallNode* call) final { + ExprVisitor::VisitExpr_(call); + } + + private: + ParallelGroup groups_; + }; +}; + +Expr PartitionGraph(const Expr& expr) { + Partitioner part; + return part.Mutate(expr); +} + +} // namespace graph_partitioning + +namespace transform { + +Pass PartitionGraph() { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(graph_partitioning::PartitionGraph(f)); + }; + auto partitioned = CreateFunctionPass(pass_func, 1, "PartitionGraph", {}); + return Sequential({partitioned, InferType()}); +} + +TVM_REGISTER_API("relay._transform.PartitionGraph") +.set_body_typed(transform::PartitionGraph); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py new file mode 100644 index 000000000000..a2c8d8933ff6 --- /dev/null +++ b/tests/python/relay/test_pass_partition_graph.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Unit tests for graph partitioning.""" +import numpy as np + +import tvm +from tvm import relay +import tvm.relay.testing +import tvm.relay.transform +from tvm.relay.expr_functor import ExprMutator +from tvm.relay.annotation import subgraph_begin, subgraph_end + +class MyAnnotator(ExprMutator): + def visit_call(self, call): + print(call.op.name) + if call.op.name == "subtract": + lhs = subgraph_begin(call.args[0], "gcc") + rhs = subgraph_begin(call.args[1], "gcc") + op = relay.subtract(lhs, rhs) + return subgraph_end(op, "gcc") + + return super().visit_call(call) + + def visit_function(self, func): + return relay.Function(func.params, self.visit(func.body)) + +def annotate(expr): + ann = MyAnnotator() + return ann.visit(expr) + +def test_subgraph(): + x = relay.var('x', shape=(10, 10)) + y = relay.var('y', shape=(10, 10)) + z = x + x + f = relay.Function([x, y], y - z) + x_data = np.random.rand(10, 10).astype('float32') + y_data = np.random.rand(10, 10).astype('float32') + mod = relay.Module() + mod["main"] = annotate(f) + mod = relay.transform.PartitionGraph()(mod) + mod = relay.transform.InferType()(mod) + print(mod['main']) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) + # import pdb; pdb.set_trace() + +if __name__ == "__main__": + test_subgraph() From e0877e84c6d0d0d87ac3b5144ca566674cf36972 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 23 Aug 2019 17:55:22 +0000 Subject: [PATCH 02/34] extern op coloring infra --- include/tvm/relay/op_attr_types.h | 23 ++++- python/tvm/relay/op/__init__.py | 2 +- python/tvm/relay/op/contrib/__init__.py | 1 + python/tvm/relay/op/contrib/extern_op.py | 50 ++++++++++ python/tvm/relay/op/contrib/gcc/__init__.py | 20 ++++ python/tvm/relay/op/contrib/gcc/extern_op.py | 29 ++++++ python/tvm/relay/op/op.py | 20 ++++ python/tvm/relay/transform.py | 18 ++++ src/relay/pass/extern_op.cc | 98 +++++++++++++++++++ .../python/relay/test_pass_partition_graph.py | 18 +++- 10 files changed, 274 insertions(+), 5 deletions(-) create mode 100644 python/tvm/relay/op/contrib/extern_op.py create mode 100644 python/tvm/relay/op/contrib/gcc/__init__.py create mode 100644 python/tvm/relay/op/contrib/gcc/extern_op.py create mode 100644 src/relay/pass/extern_op.cc diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 741e8b478828..02e1a6ac08d4 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace tvm { namespace relay { @@ -122,7 +123,7 @@ using FTVMSchedule = runtime::TypedPackedFunc< * operator with other expressions. This function will be invoked * in AlterOpLayout pass. * \param attrs The attribute of the original node. - * \param inputs The input symbols of the original node. + * \param args The input symbols of the original node. * \param tinfos An array of placeholders, use for getting the inferred shape * and dtype of the inputs. * \return new_expr The modified expression. @@ -136,8 +137,8 @@ using FTVMAlterOpLayout = runtime::TypedPackedFunc< * \brief Legalizes an expression with another expression. This function will be * invoked in Legalize pass. It is a target-dependent pass. * \param attrs The attribute of the original node. - * \param inputs The input symbols of the original node. 
- * \param tinfos An array of placeholders, use for getting the inferred shape + * \param args The input symbols of the original node. + * \param arg_types An array of placeholders, use for getting the inferred shape * and dtype of the inputs. * \return new_expr The modified expression. */ @@ -146,6 +147,22 @@ using FTVMLegalize = runtime::TypedPackedFunc< const Array& args, const Array& arg_types)>; +/*! + * \brief Annotates an expression to indicate which external codegen tool an op + * should be scheduled to. It is a hardware dependent pass. + * + * \param attrs The attribute of the original expr. + * \param args The arguments of the original expr. + * \param compiler The external compiler that is used for external ops. + * + * \return true if this op should be registered with external codegen tool, + * otherwise, false. + */ +using FTVMExternOp = runtime::TypedPackedFunc< +bool(const Attrs& attrs, + const Array& args, + const std::string& compiler)>; + /*! * \brief Forward rewriting rule for a specific op. * diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index a089cab669c9..f246750e5cd9 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -19,7 +19,7 @@ # operator defs from .op import get, register, register_schedule, register_compute, register_gradient, \ register_pattern, register_alter_op_layout, register_legalize, \ - schedule_injective, Op, OpPattern, debug + register_extern_op, schedule_injective, Op, OpPattern, debug # Operators from .reduce import * diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 3159006486b3..a369f143d4c0 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -18,4 +18,5 @@ """Neural network related operators.""" from __future__ import absolute_import as _abs from .contrib import * +from .extern_op import * from . import _contrib diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py new file mode 100644 index 000000000000..2346034faa09 --- /dev/null +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +""" +External compiler related feature registration. + +It implements dispatchers that check if an operator should use the external +codegen tool. + +Each compiler can customize the support of the operator. For example, they can +check the attribute of an operator and/or the features of the input arguments +to decide if we should use the external compiler. +""" +from __future__ import absolute_import + +from . import gcc +from .. 
import op as reg + +@reg.register_extern_op("nn.conv2d") +def external_conv2d(attrs, args, compiler): + """Check if the external compiler should be used for conv2d. + """ + if compiler == "gcc": + return gcc.extern_op.conv2d(attrs, args) + + raise RuntimeError("conv2d in {} is not registered" % (compiler)) + + +@reg.register_extern_op("subtract") +def external_subtract(attrs, args, compiler): + """Check if the external compiler should be used for conv2d. + """ + if compiler == "gcc": + return gcc.extern_op.subtract(attrs, args) + + raise RuntimeError("subtract in {} is not registered" % (compiler)) diff --git a/python/tvm/relay/op/contrib/gcc/__init__.py b/python/tvm/relay/op/contrib/gcc/__init__.py new file mode 100644 index 000000000000..0da426ab4741 --- /dev/null +++ b/python/tvm/relay/op/contrib/gcc/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .extern_op import * diff --git a/python/tvm/relay/op/contrib/gcc/extern_op.py b/python/tvm/relay/op/contrib/gcc/extern_op.py new file mode 100644 index 000000000000..958195913905 --- /dev/null +++ b/python/tvm/relay/op/contrib/gcc/extern_op.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""GCC compiler supported operators.""" +from __future__ import absolute_import + +def conv2d(attrs, args): + """Check if the external codegen should be used for conv2d. + """ + return False + +def subtract(attrs, args): + """Check if the external codegen should be used for conv2d. 
+ """ + return True diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 355496e42b48..a70068c95047 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -229,6 +229,7 @@ def register_pattern(op_name, pattern, level=10): """ return register(op_name, "TOpPattern", pattern, level) + def register_gradient(op_name, fgradient=None, level=10): """Register operator pattern for an op. @@ -266,6 +267,25 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10): get(op_name).set_attr("TShapeDataDependant", data_dependant, level) return register(op_name, "FShapeFunc", shape_func, level) +def register_extern_op(op_name, fextern=None, level=10): + """Register the external codegen tool for an op. + + Parameters + ---------- + op_name : str + The name of the operator. + + fextern: function (attrs: Attrs, args: List[Expr], compiler: str) -> + new_expr: Expr + The function for wrapping a call expr with subgraph_start and + subgraph_end. + + level : int + The priority level + """ + return register(op_name, "FTVMExternOp", fextern, level) + + _init_api("relay.op", __name__) @register_func("relay.op.compiler._lower") diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index beb75f528c64..81474e207233 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -480,6 +480,24 @@ def Legalize(legalize_map_attr_name="FTVMLegalize"): return _transform.Legalize(legalize_map_attr_name) +def ExternOp(compiler): + """Set ops in an experession as external ops so that it will use the + external codegen tool. + + Parameters + ---------- + compiler : str + The compiler used for external codegen. + + Returns + ------- + ret : tvm.relay.Pass + The annotated pass that wrapps ops with subgraph_start and + subgraph_end. + """ + return _transform.ExternOp(compiler) + + def RewriteAnnotatedOps(fallback_device): """Rewrite the annotated program where annotation operators, e.g. `on_deivce`, mark which device an expression should be scheduled to. diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc new file mode 100644 index 000000000000..829ea91ef9a0 --- /dev/null +++ b/src/relay/pass/extern_op.cc @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file src/relay/pass/extern_op.cc + * \brief Wraps a call with subgraph_begin and subgraph_end to indicate that the op of this call + * node will use external compiler. 
+ */ + +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace extern_op { + +class ExternOpWrapper : public ExprMutator { + public: + explicit ExternOpWrapper(const std::string& compiler) : compiler_(compiler) {} + + Expr VisitExpr_(const CallNode* cn) { + auto new_e = ExprMutator::VisitExpr_(cn); + + Call call = Downcast(new_e); + static auto fextern = Op::GetAttr("FTVMExternOp"); + Op op = Downcast(call->op); + CHECK(op.operator->()); + + if (fextern.count(op)) { + bool external = fextern[op](call->attrs, call->args, compiler_); + if (external) { + tvm::Array subgraph_begins; + for (const auto& it : call->args) { + const auto* begin_op = + runtime::Registry::Get("relay.op.annotation._make.subgraph_begin"); + CHECK(begin_op); + Expr begin = (*begin_op)(it, compiler_); + subgraph_begins.push_back(begin); + } + Expr begin_call = CallNode::make(call->op, subgraph_begins, call->attrs); + const auto* end_op = + runtime::Registry::Get("relay.op.annotation._make.subgraph_end"); + CHECK(end_op); + Expr end = (*end_op)(begin_call, compiler_); + return end; + } + } + return new_e; + } + + private: + std::string compiler_; +}; + +Expr ExternOp(const Expr& expr, const std::string& compiler) { + return ExternOpWrapper(compiler).Mutate(expr); +} + +} // namespace extern_op + +namespace transform { + +Pass ExternOp(const std::string& compiler) { + runtime::TypedPackedFunc pass_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(relay::extern_op::ExternOp(f, compiler)); + }; + auto func_pass = CreateFunctionPass(pass_func, 1, "ExternOpFunc", + {ir::StringImm::make("InferType")}); + return transform::Sequential({func_pass, InferType()}, "ExternOp"); +} + +TVM_REGISTER_API("relay._transform.ExternOp") +.set_body_typed(ExternOp); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index a2c8d8933ff6..9536ce6c0dc0 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -57,7 +57,23 @@ def test_subgraph(): ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) res = ex.evaluate()(x_data, y_data) tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) - # import pdb; pdb.set_trace() + +def test_extern(): + x = relay.var('x', shape=(10, 10)) + y = relay.var('y', shape=(10, 10)) + z = x + x + f = relay.Function([x, y], y - z) + x_data = np.random.rand(10, 10).astype('float32') + y_data = np.random.rand(10, 10).astype('float32') + mod = relay.Module() + mod["main"] = f + mod = relay.transform.Sequential([relay.transform.ExternOp("gcc"), + relay.transform.PartitionGraph()])(mod) + print(mod['main']) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) if __name__ == "__main__": test_subgraph() + test_extern() From 1b2041da20ff2eeb4d5a1151699c4c913b54e02a Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Sun, 25 Aug 2019 21:57:36 +0000 Subject: [PATCH 03/34] eliminate redundant subgraph annotation --- src/relay/pass/extern_op.cc | 61 +++++++++++++++++++++++++++++-- src/relay/pass/partition_graph.cc | 4 -- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index 829ea91ef9a0..872e38c7616c 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ 
-20,11 +20,12 @@ /*! * Copyright (c) 2019 by Contributors * \file src/relay/pass/extern_op.cc - * \brief Wraps a call with subgraph_begin and subgraph_end to indicate that the op of this call - * node will use external compiler. + * \brief Wraps a call with subgraph_begin and subgraph_end to indicate that + * the op of this call node will use external compiler. */ #include +#include #include #include #include @@ -71,8 +72,62 @@ class ExternOpWrapper : public ExprMutator { std::string compiler_; }; +/*! + * \brief Eleminates the back-to-back subgraph_begin and end annotations if they + * are using the same external compiler. For example, the following graph + * + * subgraph_begin + * | + * op1 + * | + * subgraph_end + * | + * subgraph_begin + * | + * op2 + * | + * subgraph_end + * + * will be updated to if op1 and op2 require codegen from the same external + * compiler. + * + * subgraph_begin + * | + * op1 + * | + * op2 + * | + * subgraph_end + */ +struct EliminateAnnotation : public ExprMutator { + Expr VisitExpr_(const CallNode* cn) { + Expr new_e = ExprMutator::VisitExpr_(cn); + const auto* op_node = cn->op.as(); + if (op_node && GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { + Expr input = cn->args[0]; + if (input.as() == nullptr) return new_e; + Call input_call = Downcast(input); + if (input_call.defined()) { + const auto* call_op = input_call->op.as(); + if (call_op && + GetRef(call_op) == Op::Get("annotation.subgraph_end")) { + auto end_attrs = cn->attrs.as(); + auto begin_attrs = input_call->attrs.as(); + if (end_attrs && begin_attrs && + end_attrs->compiler == begin_attrs->compiler) { + // Eliminate end and begin + return input_call->args[0]; + } + } + } + } + return new_e; + } +}; + Expr ExternOp(const Expr& expr, const std::string& compiler) { - return ExternOpWrapper(compiler).Mutate(expr); + Expr annotated = ExternOpWrapper(compiler).Mutate(expr); + return EliminateAnnotation().Mutate(annotated); } } // namespace extern_op diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 83256db59e0d..5f7554b4d806 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -83,8 +83,6 @@ class Partitioner : public ExprMutator { return std::move(var); } else { CHECK(GetRef(op_node) == Op::Get("annotation.subgraph_end")); - CHECK(!found_end_) << "Nested subgraph is not allowed." 
<< "\n"; - found_end_ = true; auto subgraph_attrs = call->attrs.as(); CHECK(subgraph_attrs); @@ -106,7 +104,6 @@ class Partitioner : public ExprMutator { tvm::ir::StringImm::make(subgraph_attrs->compiler)); subgraph_args_.clear(); var_count_ = 0; - found_end_ = false; return CallNode::make(subgraph_func, args); } } @@ -137,7 +134,6 @@ class Partitioner : public ExprMutator { private: int var_count_{0}; - bool found_end_{false}; std::vector > subgraph_args_; }; From f7e299b0ca1c6624a195d12b807a7235b34bb7ad Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 29 Aug 2019 20:35:25 +0000 Subject: [PATCH 04/34] A failing example --- src/relay/pass/extern_op.cc | 60 ++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index 872e38c7616c..daa0c185f4be 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -57,11 +57,11 @@ class ExternOpWrapper : public ExprMutator { Expr begin = (*begin_op)(it, compiler_); subgraph_begins.push_back(begin); } - Expr begin_call = CallNode::make(call->op, subgraph_begins, call->attrs); + Expr update_call = CallNode::make(call->op, subgraph_begins, call->attrs); const auto* end_op = runtime::Registry::Get("relay.op.annotation._make.subgraph_end"); CHECK(end_op); - Expr end = (*end_op)(begin_call, compiler_); + Expr end = (*end_op)(update_call, compiler_); return end; } } @@ -73,31 +73,71 @@ class ExternOpWrapper : public ExprMutator { }; /*! - * \brief Eleminates the back-to-back subgraph_begin and end annotations if they - * are using the same external compiler. For example, the following graph + * \brief Eleminates the back-to-back subgraph_begin(s) and end(e) annotations + * if they are using the same external compiler. For example, the following + * Relay program * - * subgraph_begin + * b * | * op1 * | - * subgraph_end + * e * | - * subgraph_begin + * b * | * op2 * | - * subgraph_end + * e * * will be updated to if op1 and op2 require codegen from the same external * compiler. * - * subgraph_begin + * b * | * op1 * | * op2 * | - * subgraph_end + * e + * + * However, in the following case (op1-6 and op8 use external compiler and op7 + * uses tvm codegen), we cannot simply cancel all back-to-back `start` and + * `end` annotations even if they use the same external compiler. + * + * For example, op1-6 and op8 would be grouped into the same subgraph if we + * cancel the back-to-back start and end annotations, leaving op7 alone in a + * separate subgraph. Unfortunately, it creates a cycle where one output of + * the former subgraph flows into the latter, and meanwhile it requires the + * the computed results of op7 from the latter subgraph. + * + * Hence, we should prevent op1-6 and op8 falling into the same subgraph all + * together in such a case. + * + * | + * b + * | + * op1 + * / | \ + * e e e + * | | | + * b b b + * | | | + * op2 op3 op4 + * | | | + * e e e + * | | | + * b b | + * | | | + * op5 op6 op7 + * | | | + * e e | + * | | | + * b b b + * \ | / + * op8 + * | + * e + * | */ struct EliminateAnnotation : public ExprMutator { Expr VisitExpr_(const CallNode* cn) { From 60004d16dcf195e79bde941d293816bdc91b57fe Mon Sep 17 00:00:00 2001 From: comaniac Date: Fri, 30 Aug 2019 16:32:16 -0700 Subject: [PATCH 05/34] Refine partition algorithm Remaining issues: - External function body does not refer to the generated arguments when function body is more than one node. - Runtime is not working due to the fix of external codegen library. 
- Not working when there have more than one subgraphs during partition. --- python/tvm/relay/op/contrib/extern_op.py | 24 ++- python/tvm/relay/op/contrib/gcc/extern_op.py | 14 +- src/relay/backend/test_external_codegen.cc | 2 +- src/relay/pass/extern_op.cc | 2 +- src/relay/pass/partition_graph.cc | 182 +++++++++++++----- .../python/relay/test_pass_partition_graph.py | 74 +++---- 6 files changed, 210 insertions(+), 88 deletions(-) diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index 2346034faa09..8daec9c4d3bd 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -32,7 +32,7 @@ @reg.register_extern_op("nn.conv2d") def external_conv2d(attrs, args, compiler): - """Check if the external compiler should be used for conv2d. + """Check if the external compiler should be used. """ if compiler == "gcc": return gcc.extern_op.conv2d(attrs, args) @@ -42,9 +42,29 @@ def external_conv2d(attrs, args, compiler): @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): - """Check if the external compiler should be used for conv2d. + """Check if the external compiler should be used. """ if compiler == "gcc": return gcc.extern_op.subtract(attrs, args) raise RuntimeError("subtract in {} is not registered" % (compiler)) + + +@reg.register_extern_op("add") +def external_add(attrs, args, compiler): + """Check if the external compiler should be used. + """ + if compiler == "gcc": + return gcc.extern_op.add(attrs, args) + + raise RuntimeError("add in {} is not registered" % (compiler)) + + +@reg.register_extern_op("multiply") +def external_multiply(attrs, args, compiler): + """Check if the external compiler should be used. + """ + if compiler == "gcc": + return gcc.extern_op.multiply(attrs, args) + + raise RuntimeError("multiply in {} is not registered" % (compiler)) diff --git a/python/tvm/relay/op/contrib/gcc/extern_op.py b/python/tvm/relay/op/contrib/gcc/extern_op.py index 958195913905..1d85f1916992 100644 --- a/python/tvm/relay/op/contrib/gcc/extern_op.py +++ b/python/tvm/relay/op/contrib/gcc/extern_op.py @@ -19,11 +19,21 @@ from __future__ import absolute_import def conv2d(attrs, args): - """Check if the external codegen should be used for conv2d. + """Check if the external codegen should be used. """ return False def subtract(attrs, args): - """Check if the external codegen should be used for conv2d. + """Check if the external codegen should be used. + """ + return True + +def add(attrs, args): + """Check if the external codegen should be used. + """ + return True + +def multiply(attrs, args): + """Check if the external codegen should be used. 
""" return True diff --git a/src/relay/backend/test_external_codegen.cc b/src/relay/backend/test_external_codegen.cc index 232d4c61b45b..1bb41a07d7a2 100644 --- a/src/relay/backend/test_external_codegen.cc +++ b/src/relay/backend/test_external_codegen.cc @@ -93,7 +93,7 @@ class ExternalModuleNode : public runtime:: ModuleNode { *rv = c; }); } else { - LOG(FATAL) << "Unknow function found when invoking extern library: " << name; + LOG(FATAL) << "Unknown function found when invoking extern library: " << name; return PackedFunc(); } } diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index daa0c185f4be..b37aa85a36c2 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -167,7 +167,7 @@ struct EliminateAnnotation : public ExprMutator { Expr ExternOp(const Expr& expr, const std::string& compiler) { Expr annotated = ExternOpWrapper(compiler).Mutate(expr); - return EliminateAnnotation().Mutate(annotated); + return annotated; //EliminateAnnotation().Mutate(annotated); } } // namespace extern_op diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 5f7554b4d806..921b5e79b760 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -30,14 +30,14 @@ */ #include +#include #include #include -#include #include +#include #include #include -#include #include #include @@ -45,15 +45,53 @@ namespace tvm { namespace relay { namespace graph_partitioning { +/*! + * \brief The subgraph properties for partition. + */ +struct Subgraph { + /*! \brief The subgraph ID. */ + int id; + + /*! \brief The input arguments of this subgraph. */ + std::vector> args; + + /*! \brief Nodes in this subgraph. */ + std::unordered_set nodes; +}; + /*! * \brief The checker that verifies if a Relay program is annotated correctly * for graph partitioning. */ class AnnotationChecker : public ExprVisitor { public: - bool Check(const Expr& expr) { + bool Check() { + if (!this->found_start && !this->found_end) { + LOG(WARNING) << "No subgraph annotation found"; + } else if (!this->found_start) { + LOG(ERROR) << "Subgraph start annotation is missing"; + return false; + } else if (!this->found_end) { + LOG(ERROR) << "Subgraph end annotation is missing"; + return false; + } return true; } + + void VisitExpr_(const CallNode* call) final { + auto op_node = call->op.as(); + if (op_node == nullptr || call->attrs.as() == nullptr) { + return; + } else if (GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { + this->found_start = true; + } else if (GetRef(op_node) == Op::Get("annotation.subgraph_end")) { + this->found_end = true; + } + } + + private: + bool found_start = false; + bool found_end = false; }; /*! \brief This class partitions the graph labeled with begin and end annoations @@ -65,76 +103,125 @@ class AnnotationChecker : public ExprVisitor { */ class Partitioner : public ExprMutator { public: + Subgraph* GetSubgraph(const Expr node) { + for (auto candidate : this->subgraphs_) { + if (candidate->nodes.find(node) != candidate->nodes.end()) { + return candidate; + } + } + return nullptr; + } + + void MergeSubgraph(Subgraph* subgraph1, Subgraph* subgraph2) { + // Merge subgraph 2 to subgraph 1 and erase subgraph 2. 
+ subgraph1->nodes.insert(subgraph2->nodes.begin(), subgraph2->nodes.end()); + for (auto arg : subgraph2->args) { + subgraph1->args.push_back(arg); + } + this->subgraphs_.erase(subgraph2); + } + + void AddToSubgraph(Subgraph* subgraph, const Expr expr) { + auto subgraph2 = GetSubgraph(expr); + if (subgraph2) { + MergeSubgraph(subgraph, subgraph2); + } else { + subgraph->nodes.insert(expr); + } + } + Expr VisitExpr_(const CallNode* call) final { auto op_node = call->op.as(); - // Use the default visitor to traverse the nodes that are not subgraph - // nodes. if (op_node == nullptr || call->attrs.as() == nullptr) { + // Propogate subgraph to arguments + auto subgraph = GetSubgraph(GetRef(call)); + if (subgraph) { + for (auto arg : call->args) { + AddToSubgraph(subgraph, arg); + } + } return ExprMutator::VisitExpr_(call); - } + } else if (GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { + // The annotation node is inserted on edge so it must have only one argument. + CHECK(call->args.size() == 1); - if (GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { + // Traverse the rest graph. auto input_expr = VisitExpr(call->args[0]); + + // Replace the begin annotation with an external call input variable. auto subgraph_attrs = call->attrs.as(); - auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_count_++), - call->args[0]->checked_type()); - subgraph_args_.push_back({var, input_expr}); + auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_id_++), + input_expr->checked_type()); + + // Find the corresponding subgraph and add the argument. + auto subgraph = GetSubgraph(GetRef(call)); + if (!subgraph) { + throw Error(RELAY_ERROR("Cannot find the corresponding subgraph for end annotation:\n" + << AsText(GetRef(call), false))); + } + subgraph->args.push_back({var, input_expr}); + //LOG(ERROR) << "Add an argument to subgraph " << subgraph->id << ":\n" << AsText(var, false); return std::move(var); } else { CHECK(GetRef(op_node) == Op::Get("annotation.subgraph_end")); + // The annotation node is inserted on edge so it must have only one argument. + CHECK(call->args.size() == 1); auto subgraph_attrs = call->attrs.as(); - CHECK(subgraph_attrs); + + // Check if the argument is already belonged to an exist subgraph + auto subgraph = GetSubgraph(call->args[0]); + if (!subgraph) { + auto ret = this->subgraphs_.emplace(new Subgraph()); + subgraph = *ret.first; + subgraph->nodes.insert(call->args[0]); + subgraph->id = this->subgraph_id_++; + } + subgraph->nodes.insert(GetRef(call)); + + // Traverse towarding to subgraph inputs. auto input = VisitExpr(call->args[0]); Array params; Array args; - for (auto pair : subgraph_args_) { + // The subgraph may be merged so we need to update it again. + subgraph = GetSubgraph(GetRef(call)); + for (auto pair : subgraph->args) { params.push_back(pair.first); args.push_back(pair.second); } auto subgraph_func = FunctionNode::make(params, input, Type(), {}, Attrs()); - + + // FIXME: How to determine the function name? 
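+      // For now every subgraph function is named "Subtract" so that the test
+      // external module can resolve it; unique per-subgraph names are needed.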
subgraph_func = FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make("Subtract")); subgraph_func = FunctionSetAttr(subgraph_func, "Primitive", tvm::Integer(1)); subgraph_func = FunctionSetAttr(subgraph_func, "External", tvm::ir::StringImm::make(subgraph_attrs->compiler)); - subgraph_args_.clear(); - var_count_ = 0; return CallNode::make(subgraph_func, args); } } - /* - * \brief For cases like the following: - * - * op1 - * | - * end - * | - * op2 - * / \ - * x y - * - * where x and y could be inputs, e.g. vars and/or constants. Here, we should - * group all nodes/expressions that are dominated by op2 in the same subgraph. - */ - Expr VisitExpr_(const VarNode* vn) final { - Expr var = GetRef(vn); - return var; - } - - Expr VisitExpr_(const ConstantNode* cn) final { - Expr constant = GetRef(cn); - return constant; + Expr VisitExpr_(const TupleNode* op) { + Expr ref = GetRef(op); + auto subgraph = GetSubgraph(ref); + if (subgraph) { + for (auto field : op->fields) { + AddToSubgraph(subgraph, field); + } + } + for (auto field : op->fields) { + VisitExpr(field); + } + return ref; } private: - int var_count_{0}; - std::vector > subgraph_args_; + int var_id_{0}; + int subgraph_id_{0}; + std::unordered_set subgraphs_; }; /*! @@ -174,9 +261,7 @@ class ParallelSubgraphCombiner : public ExprMutator { return groups_; } - void VisitExpr_(const CallNode* call) final { - ExprVisitor::VisitExpr_(call); - } + void VisitExpr_(const CallNode* call) final { ExprVisitor::VisitExpr_(call); } private: ParallelGroup groups_; @@ -193,16 +278,15 @@ Expr PartitionGraph(const Expr& expr) { namespace transform { Pass PartitionGraph() { - runtime::TypedPackedFunc pass_func = - [=](Function f, Module m, PassContext pc) { - return Downcast(graph_partitioning::PartitionGraph(f)); - }; - auto partitioned = CreateFunctionPass(pass_func, 1, "PartitionGraph", {}); + runtime::TypedPackedFunc part_func = + [=](Function f, Module m, PassContext pc) { + return Downcast(graph_partitioning::PartitionGraph(f)); + }; + auto partitioned = CreateFunctionPass(part_func, 1, "PartitionGraph", {}); return Sequential({partitioned, InferType()}); } -TVM_REGISTER_API("relay._transform.PartitionGraph") -.set_body_typed(transform::PartitionGraph); +TVM_REGISTER_API("relay._transform.PartitionGraph").set_body_typed(transform::PartitionGraph); } // namespace transform diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 9536ce6c0dc0..148284b849a6 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -26,13 +26,14 @@ class MyAnnotator(ExprMutator): def visit_call(self, call): - print(call.op.name) - if call.op.name == "subtract": - lhs = subgraph_begin(call.args[0], "gcc") - rhs = subgraph_begin(call.args[1], "gcc") - op = relay.subtract(lhs, rhs) + #print(call.op.name) + if call.op.name == "log": # Annotate begin at args + inp = subgraph_begin(call.args[0], "gcc") + op = relay.log(inp) + return op + elif call.op.name == "concatenate": # Annotate end at output + op = super().visit_call(call) return subgraph_end(op, "gcc") - return super().visit_call(call) def visit_function(self, func): @@ -42,38 +43,45 @@ def annotate(expr): ann = MyAnnotator() return ann.visit(expr) -def test_subgraph(): +def test_partition_graph(): x = relay.var('x', shape=(10, 10)) - y = relay.var('y', shape=(10, 10)) - z = x + x - f = relay.Function([x, y], y - z) - x_data = np.random.rand(10, 10).astype('float32') - y_data = 
np.random.rand(10, 10).astype('float32') + #y = relay.var('y', shape=(10, 10)) + z0 = relay.log(x) + z1 = relay.log(x) + z2 = relay.exp(x) + p0 = relay.sin(z0) + p1 = relay.sin(z1) + p2 = relay.log(z2) + q = relay.concatenate((p0, p1, p2), axis=0) + f = relay.Function([x], q) mod = relay.Module() mod["main"] = annotate(f) mod = relay.transform.PartitionGraph()(mod) mod = relay.transform.InferType()(mod) print(mod['main']) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) + #x_data = np.random.rand(10, 10).astype('float32') + #y_data = np.random.rand(10, 10).astype('float32') + #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + #res = ex.evaluate()(x_data, y_data) + #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) -def test_extern(): - x = relay.var('x', shape=(10, 10)) - y = relay.var('y', shape=(10, 10)) - z = x + x - f = relay.Function([x, y], y - z) - x_data = np.random.rand(10, 10).astype('float32') - y_data = np.random.rand(10, 10).astype('float32') - mod = relay.Module() - mod["main"] = f - mod = relay.transform.Sequential([relay.transform.ExternOp("gcc"), - relay.transform.PartitionGraph()])(mod) - print(mod['main']) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) +# def test_extern(): +# x = relay.var('x', shape=(10, 10)) +# y = relay.var('y', shape=(10, 10)) +# z = x + x +# p = y * y +# f = relay.Function([x, y], p - z) +# x_data = np.random.rand(10, 10).astype('float32') +# y_data = np.random.rand(10, 10).astype('float32') +# mod = relay.Module() +# mod["main"] = f +# mod = relay.transform.ExternOp("gcc")(mod) +# mod = relay.transform.PartitionGraph()(mod) +# print(mod['main']) +# #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) +# #res = ex.evaluate()(x_data, y_data) +# #tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) -if __name__ == "__main__": - test_subgraph() - test_extern() +# if __name__ == "__main__": +# test_partition_graph() +# test_extern() From fe3c486039920612508be8d022bbab58021ac21b Mon Sep 17 00:00:00 2001 From: comaniac Date: Fri, 30 Aug 2019 17:46:17 -0700 Subject: [PATCH 06/34] Support multiple subgraphs (runtime not work) --- src/relay/pass/partition_graph.cc | 22 +++--- .../python/relay/test_pass_partition_graph.py | 70 +++++++++---------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 921b5e79b760..58fbac12519f 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -122,16 +122,17 @@ class Partitioner : public ExprMutator { } void AddToSubgraph(Subgraph* subgraph, const Expr expr) { - auto subgraph2 = GetSubgraph(expr); - if (subgraph2) { - MergeSubgraph(subgraph, subgraph2); - } else { - subgraph->nodes.insert(expr); - } + auto subgraph2 = GetSubgraph(expr); + if (subgraph2) { + MergeSubgraph(subgraph, subgraph2); + } else { + subgraph->nodes.insert(expr); + } } Expr VisitExpr_(const CallNode* call) final { auto op_node = call->op.as(); + LOG(ERROR) << op_node->name; if (op_node == nullptr || call->attrs.as() == nullptr) { // Propogate subgraph to arguments @@ -151,8 +152,9 @@ class Partitioner : public ExprMutator { // Replace the begin annotation with an external call input variable. 
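For context, the begin/end handling in this visitor is driven from Python roughly as follows; this is a minimal sketch based on the unit test updated in this patch, and the exact API surface is still in flux in this series:

from tvm import relay
# assumed import path for the new annotation helpers; may be re-exported elsewhere
from tvm.relay.op.annotation.annotation import subgraph_begin, subgraph_end

x = relay.var("x", shape=(10, 10))
lhs = subgraph_begin(x, "gcc")
rhs = subgraph_begin(x, "gcc")
out = subgraph_end(relay.add(lhs, rhs), "gcc")

mod = relay.Module()
mod["main"] = relay.Function([x], out)
mod = relay.transform.PartitionGraph()(mod)
print(mod["main"])  # the add now lives in a function tagged External="gcc"
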
auto subgraph_attrs = call->attrs.as(); + LOG(ERROR) << "Checking var type"; auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_id_++), - input_expr->checked_type()); + input_expr->checked_type_); // Find the corresponding subgraph and add the argument. auto subgraph = GetSubgraph(GetRef(call)); @@ -161,7 +163,6 @@ class Partitioner : public ExprMutator { << AsText(GetRef(call), false))); } subgraph->args.push_back({var, input_expr}); - //LOG(ERROR) << "Add an argument to subgraph " << subgraph->id << ":\n" << AsText(var, false); return std::move(var); } else { CHECK(GetRef(op_node) == Op::Get("annotation.subgraph_end")); @@ -192,8 +193,9 @@ class Partitioner : public ExprMutator { args.push_back(pair.second); } - auto subgraph_func = FunctionNode::make(params, input, Type(), {}, Attrs()); - + auto subgraph_func = + FunctionNode::make(params, input, call->args[0]->checked_type_, {}, Attrs()); + // FIXME: How to determine the function name? subgraph_func = FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make("Subtract")); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 148284b849a6..c38b0dffe2b9 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -43,45 +43,45 @@ def annotate(expr): ann = MyAnnotator() return ann.visit(expr) -def test_partition_graph(): - x = relay.var('x', shape=(10, 10)) - #y = relay.var('y', shape=(10, 10)) - z0 = relay.log(x) - z1 = relay.log(x) - z2 = relay.exp(x) - p0 = relay.sin(z0) - p1 = relay.sin(z1) - p2 = relay.log(z2) - q = relay.concatenate((p0, p1, p2), axis=0) - f = relay.Function([x], q) - mod = relay.Module() - mod["main"] = annotate(f) - mod = relay.transform.PartitionGraph()(mod) - mod = relay.transform.InferType()(mod) - print(mod['main']) - #x_data = np.random.rand(10, 10).astype('float32') - #y_data = np.random.rand(10, 10).astype('float32') - #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - #res = ex.evaluate()(x_data, y_data) - #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) - -# def test_extern(): +# def test_partition_graph(): # x = relay.var('x', shape=(10, 10)) -# y = relay.var('y', shape=(10, 10)) -# z = x + x -# p = y * y -# f = relay.Function([x, y], p - z) -# x_data = np.random.rand(10, 10).astype('float32') -# y_data = np.random.rand(10, 10).astype('float32') +# #y = relay.var('y', shape=(10, 10)) +# z0 = relay.log(x) +# z1 = relay.log(x) +# z2 = relay.exp(x) +# p0 = relay.sin(z0) +# p1 = relay.sin(z1) +# p2 = relay.log(z2) +# q = relay.concatenate((p0, p1, p2), axis=0) +# f = relay.Function([x], q) # mod = relay.Module() -# mod["main"] = f -# mod = relay.transform.ExternOp("gcc")(mod) +# mod["main"] = annotate(f) # mod = relay.transform.PartitionGraph()(mod) +# mod = relay.transform.InferType()(mod) # print(mod['main']) +# #x_data = np.random.rand(10, 10).astype('float32') +# #y_data = np.random.rand(10, 10).astype('float32') # #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) # #res = ex.evaluate()(x_data, y_data) -# #tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) +# #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) + +def test_extern(): + x = relay.var('x', shape=(10, 10)) + y = relay.var('y', shape=(10, 10)) + z = x + x + p = y * y + f = relay.Function([x, y], p - z) + x_data = np.random.rand(10, 10).astype('float32') + y_data = np.random.rand(10, 
10).astype('float32') + mod = relay.Module() + mod["main"] = f + mod = relay.transform.ExternOp("gcc")(mod) + mod = relay.transform.PartitionGraph()(mod) + print(mod['main']) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) -# if __name__ == "__main__": -# test_partition_graph() -# test_extern() +if __name__ == "__main__": +# test_partition_graph() + test_extern() From 8734d0a685b7434c76cfe1067ac86c9ead9ffb32 Mon Sep 17 00:00:00 2001 From: comaniac Date: Fri, 30 Aug 2019 18:15:00 -0700 Subject: [PATCH 07/34] Support multiple function body nodes - All nodes except for Call and Tuple are required to propogate subgraph. - Multiple outputs are not supported yet. - Both unit tests are correct in terms of graph partition. - Runtime is not working yet. Need to implement external add and multiply. --- src/relay/pass/partition_graph.cc | 19 ++++---- .../python/relay/test_pass_partition_graph.py | 44 +++++++++---------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 58fbac12519f..ab7b606dd2c8 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -132,7 +132,6 @@ class Partitioner : public ExprMutator { Expr VisitExpr_(const CallNode* call) final { auto op_node = call->op.as(); - LOG(ERROR) << op_node->name; if (op_node == nullptr || call->attrs.as() == nullptr) { // Propogate subgraph to arguments @@ -151,8 +150,8 @@ class Partitioner : public ExprMutator { auto input_expr = VisitExpr(call->args[0]); // Replace the begin annotation with an external call input variable. + // TODO: Confirm if it is safe to use checked_type_ instead of checked_type() auto subgraph_attrs = call->attrs.as(); - LOG(ERROR) << "Checking var type"; auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_id_++), input_expr->checked_type_); @@ -207,17 +206,19 @@ class Partitioner : public ExprMutator { } Expr VisitExpr_(const TupleNode* op) { - Expr ref = GetRef(op); - auto subgraph = GetSubgraph(ref); - if (subgraph) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { for (auto field : op->fields) { AddToSubgraph(subgraph, field); } + Array fields; + for (auto field : op->fields) { + fields.push_back(VisitExpr(field)); + } + return TupleNode::make(fields); } - for (auto field : op->fields) { - VisitExpr(field); - } - return ref; } private: diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index c38b0dffe2b9..d09d4cef16e7 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -43,27 +43,27 @@ def annotate(expr): ann = MyAnnotator() return ann.visit(expr) -# def test_partition_graph(): -# x = relay.var('x', shape=(10, 10)) -# #y = relay.var('y', shape=(10, 10)) -# z0 = relay.log(x) -# z1 = relay.log(x) -# z2 = relay.exp(x) -# p0 = relay.sin(z0) -# p1 = relay.sin(z1) -# p2 = relay.log(z2) -# q = relay.concatenate((p0, p1, p2), axis=0) -# f = relay.Function([x], q) -# mod = relay.Module() -# mod["main"] = annotate(f) -# mod = relay.transform.PartitionGraph()(mod) -# mod = relay.transform.InferType()(mod) -# print(mod['main']) -# #x_data = np.random.rand(10, 10).astype('float32') -# #y_data = np.random.rand(10, 10).astype('float32') -# #ex = 
relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) -# #res = ex.evaluate()(x_data, y_data) -# #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) +def test_partition_graph(): + x = relay.var('x', shape=(10, 10)) + #y = relay.var('y', shape=(10, 10)) + z0 = relay.log(x) + z1 = relay.log(x) + z2 = relay.exp(x) + p0 = relay.sin(z0) + p1 = relay.sin(z1) + p2 = relay.log(z2) + q = relay.concatenate((p0, p1, p2), axis=0) + f = relay.Function([x], q) + mod = relay.Module() + mod["main"] = annotate(f) + mod = relay.transform.PartitionGraph()(mod) + mod = relay.transform.InferType()(mod) + print(mod['main']) + #x_data = np.random.rand(10, 10).astype('float32') + #y_data = np.random.rand(10, 10).astype('float32') + #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + #res = ex.evaluate()(x_data, y_data) + #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) def test_extern(): x = relay.var('x', shape=(10, 10)) @@ -83,5 +83,5 @@ def test_extern(): tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) if __name__ == "__main__": -# test_partition_graph() + test_partition_graph() test_extern() From 590996e9106284d12e485d4c6bf604a4e092b4d3 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 2 Sep 2019 01:03:29 +0000 Subject: [PATCH 08/34] Add a hack for multiple subgraphes - Each of these subgraphes only contains one primitive op. - Subgraphes contains multiple nodes are not handled. --- src/relay/backend/test_external_codegen.cc | 4 +- src/relay/backend/test_external_library.cc | 99 +++++++++++++++------- src/relay/pass/partition_graph.cc | 15 +++- 3 files changed, 85 insertions(+), 33 deletions(-) diff --git a/src/relay/backend/test_external_codegen.cc b/src/relay/backend/test_external_codegen.cc index 1bb41a07d7a2..0723d7976c0b 100644 --- a/src/relay/backend/test_external_codegen.cc +++ b/src/relay/backend/test_external_codegen.cc @@ -54,9 +54,9 @@ class ExternalModuleNode : public runtime:: ModuleNode { PackedFunc GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) override { - if (name == "Subtract") { + if (name == "Subtract" || "Add" || "Multiply") { CHECK(handle_) << "You need to build the external module first"; - func_s_ = reinterpret_cast(dlsym(handle_,"Subtract")); + func_s_ = reinterpret_cast(dlsym(handle_, name.c_str())); char* error = dlerror(); if (error != NULL) { LOG(FATAL) << error; diff --git a/src/relay/backend/test_external_library.cc b/src/relay/backend/test_external_library.cc index b3a47d59d63c..a076785bfb44 100644 --- a/src/relay/backend/test_external_library.cc +++ b/src/relay/backend/test_external_library.cc @@ -21,34 +21,73 @@ #include #include -extern "C" void Subtract(ExternalTensor a, ExternalTensor b, ExternalTensor* out) { - if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { - std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim - << ", b.ndim = " << b.ndim - << ", out ndim = " << out->ndim << std::endl; +#define GCC_BINARY_OP(OP, SYMB) \ + extern "C" void OP(ExternalTensor a, ExternalTensor b, \ + ExternalTensor* out) { \ + if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { \ + std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim \ + << ", b.ndim = " << b.ndim << ", out ndim = " << out->ndim \ + << std::endl; \ + } \ + for (int i = 0; i < a.ndim; i++) { \ + if (a.shape[i] != b.shape[i]) { \ + std::cerr << "shape[" << i << "]: a = " << a.shape[i] \ + << ", b = " << b.shape[i] << std::endl; \ + } \ + } \ + std::cout << 
"dim: " << a.ndim << " shape: " << std::endl; \ + for (int i = 0; i < a.ndim; i++) { \ + std::cout << a.shape[i] << " " << b.shape[i] << std::endl; \ + } \ + float* a_ptr = static_cast(a.data); \ + float* b_ptr = static_cast(b.data); \ + float* out_ptr = static_cast(out->data); \ + if (a.ndim == 1) { \ + for (int64_t i = 0; i < a.shape[0]; i++) { \ + out_ptr[i] = a_ptr[i] SYMB b_ptr[i]; \ + } \ + } else { \ + for (int64_t i = 0; i < a.shape[0]; i++) { \ + for (int64_t j = 0; j < a.shape[1]; j++) { \ + int64_t k = i * a.shape[1] + j; \ + out_ptr[k] = a_ptr[k] SYMB b_ptr[k]; \ + } \ + } \ + } \ } - for (int i = 0; i < a.ndim; i++) { - if (a.shape[i] != b.shape[i]) { - std::cerr << "shape[" << i << "]: a = " << a.shape[i] << ", b = " << b.shape[i] << std::endl; - } - } - std::cout << "dim: " << a.ndim << " shape: " << std::endl; - for (int i = 0; i < a.ndim; i++) { - std::cout << a.shape[i] << " " << b.shape[i] << std::endl; - } - float* a_ptr = static_cast(a.data); - float* b_ptr = static_cast(b.data); - float* out_ptr = static_cast(out->data); - if (a.ndim == 1) { - for (int64_t i = 0; i < a.shape[0]; i++) { - out_ptr[i] = a_ptr[i] - b_ptr[i]; - } - } else { - for (int64_t i = 0; i < a.shape[0]; i++) { - for (int64_t j = 0; j < a.shape[1]; j++) { - int64_t k = i * a.shape[1] + j; - out_ptr[k] = a_ptr[k] - b_ptr[k]; - } - } - } -} + +GCC_BINARY_OP(Subtract, -); +GCC_BINARY_OP(Add, +); +GCC_BINARY_OP(Multiply, *); + +// extern "C" void Subtract(ExternalTensor a, ExternalTensor b, ExternalTensor* out) { +// if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { +// std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim +// << ", b.ndim = " << b.ndim +// << ", out ndim = " << out->ndim << std::endl; +// } +// for (int i = 0; i < a.ndim; i++) { +// if (a.shape[i] != b.shape[i]) { +// std::cerr << "shape[" << i << "]: a = " << a.shape[i] << ", b = " << b.shape[i] << std::endl; +// } +// } +// std::cout << "dim: " << a.ndim << " shape: " << std::endl; +// for (int i = 0; i < a.ndim; i++) { +// std::cout << a.shape[i] << " " << b.shape[i] << std::endl; +// } +// float* a_ptr = static_cast(a.data); +// float* b_ptr = static_cast(b.data); +// float* out_ptr = static_cast(out->data); +// if (a.ndim == 1) { +// for (int64_t i = 0; i < a.shape[0]; i++) { +// out_ptr[i] = a_ptr[i] - b_ptr[i]; +// } +// } else { +// for (int64_t i = 0; i < a.shape[0]; i++) { +// for (int64_t j = 0; j < a.shape[1]; j++) { +// int64_t k = i * a.shape[1] + j; +// out_ptr[k] = a_ptr[k] - b_ptr[k]; +// } +// } +// } +// } diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index ab7b606dd2c8..13937b95c02d 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -196,8 +196,21 @@ class Partitioner : public ExprMutator { FunctionNode::make(params, input, call->args[0]->checked_type_, {}, Attrs()); // FIXME: How to determine the function name? + // This is a hack for multiple subgraph test where each subgraph only has + // one call node. + // We can probably only pass "external" to indicate that this is an + // external funciton and leave the processing of the function to codegen. + // Otherwise, it's hard to deal with multiple-node subgraphs. 
+ Expr arg0 = call->args[0]; + std::string name = "Subgraph"; + if (const auto* arg_call = arg0.as()) { + if (const auto* op_node = arg_call->op.as()) { + name = op_node->name; + name[0] = name[0] - 32; + } + } subgraph_func = - FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make("Subtract")); + FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make(name)); subgraph_func = FunctionSetAttr(subgraph_func, "Primitive", tvm::Integer(1)); subgraph_func = FunctionSetAttr(subgraph_func, "External", tvm::ir::StringImm::make(subgraph_attrs->compiler)); From 5a6f955794b33a630ecb61721051a004e403064a Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Tue, 3 Sep 2019 13:56:56 -0700 Subject: [PATCH 09/34] Add rest node visiting to propogate subgraphs. --- src/relay/pass/partition_graph.cc | 94 +++++++++++++++++++ .../python/relay/test_pass_partition_graph.py | 20 ++-- 2 files changed, 104 insertions(+), 10 deletions(-) diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 13937b95c02d..b6e7f46e58ff 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -234,6 +234,100 @@ class Partitioner : public ExprMutator { } } + Expr VisitExpr_(const TupleGetItemNode* g) { + auto subgraph = GetSubgraph(GetRef(g)); + if (!subgraph) { + return ExprMutator::VisitExpr_(g); + } else { + AddToSubgraph(subgraph, g->tuple); + auto t = VisitExpr(g->tuple); + return TupleGetItemNode::make(t, g->index); + } + } + + Expr VisitExpr_(const FunctionNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { + Array params; + for (auto param : op->params) { + AddToSubgraph(subgraph, param); + } + for (auto param : op->params) { + Var new_param = Downcast(VisitExpr(param)); + params.push_back(new_param); + } + auto body = VisitExpr(op->body); + return FunctionNode::make(params, body, op->ret_type, op->type_params, op->attrs); + } + } + + Expr VisitExpr_(const LetNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { + AddToSubgraph(subgraph, op->var); + AddToSubgraph(subgraph, op->value); + AddToSubgraph(subgraph, op->body); + Var var = Downcast(VisitExpr(op->var)); + auto value = VisitExpr(op->value); + auto body = VisitExpr(op->body); + + return LetNode::make(var, value, body); + } + } + + Expr VisitExpr_(const IfNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { + AddToSubgraph(subgraph, op->cond); + AddToSubgraph(subgraph, op->true_branch); + AddToSubgraph(subgraph, op->false_branch); + auto guard = VisitExpr(op->cond); + auto true_b = VisitExpr(op->true_branch); + auto false_b = VisitExpr(op->false_branch); + return IfNode::make(guard, true_b, false_b); + } + } + + Expr VisitExpr_(const RefCreateNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { + AddToSubgraph(subgraph, op->value); + Expr value = VisitExpr(op->value); + return RefCreateNode::make(value); + } + } + + Expr VisitExpr_(const RefReadNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return ExprMutator::VisitExpr_(op); + } else { + AddToSubgraph(subgraph, op->ref); + Expr ref = VisitExpr(op->ref); + return RefReadNode::make(ref); + } + } + + Expr VisitExpr_(const RefWriteNode* op) { + auto subgraph = GetSubgraph(GetRef(op)); + if (!subgraph) { + return 
ExprMutator::VisitExpr_(op); + } else { + AddToSubgraph(subgraph, op->ref); + Expr ref = VisitExpr(op->ref); + Expr value = VisitExpr(op->value); + return RefWriteNode::make(ref, value); + } + } + private: int var_id_{0}; int subgraph_id_{0}; diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index d09d4cef16e7..1f707915c2f3 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -27,9 +27,10 @@ class MyAnnotator(ExprMutator): def visit_call(self, call): #print(call.op.name) - if call.op.name == "log": # Annotate begin at args - inp = subgraph_begin(call.args[0], "gcc") - op = relay.log(inp) + if call.op.name == "add": # Annotate begin at args + lhs = subgraph_begin(call.args[0], "gcc") + rhs = subgraph_begin(call.args[1], "gcc") + op = relay.add(lhs, rhs) return op elif call.op.name == "concatenate": # Annotate end at output op = super().visit_call(call) @@ -45,13 +46,12 @@ def annotate(expr): def test_partition_graph(): x = relay.var('x', shape=(10, 10)) - #y = relay.var('y', shape=(10, 10)) - z0 = relay.log(x) - z1 = relay.log(x) - z2 = relay.exp(x) - p0 = relay.sin(z0) - p1 = relay.sin(z1) - p2 = relay.log(z2) + z0 = relay.add(x, relay.const(0, dtype='float32')) + z1 = relay.add(x, relay.const(5, dtype='float32')) + z2 = relay.multiply(x, relay.const(2, dtype='float32')) + p0 = relay.subtract(z0, relay.const(3, dtype='float32')) + p1 = relay.subtract(z1, relay.const(4, dtype='float32')) + p2 = relay.add(z2, relay.const(7, dtype='float32')) q = relay.concatenate((p0, p1, p2), axis=0) f = relay.Function([x], q) mod = relay.Module() From 2ffc785e7fb172d8c544a0a047925182fdadfbbe Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 6 Sep 2019 21:24:45 +0000 Subject: [PATCH 10/34] cblas template --- cmake/modules/contrib/BLAS.cmake | 5 + python/tvm/relay/op/contrib/cblas/__init__.py | 20 ++ .../tvm/relay/op/contrib/cblas/extern_op.py | 26 ++ python/tvm/relay/op/contrib/extern_op.py | 14 +- src/relay/backend/contrib/cblas/codegen.cc | 222 ++++++++++++++++++ src/relay/backend/contrib/cblas/utils.cc | 34 +++ src/relay/pass/extern_op.cc | 97 +------- src/relay/pass/partition_graph.cc | 16 +- .../python/relay/test_pass_partition_graph.py | 11 +- 9 files changed, 336 insertions(+), 109 deletions(-) create mode 100644 python/tvm/relay/op/contrib/cblas/__init__.py create mode 100644 python/tvm/relay/op/contrib/cblas/extern_op.py create mode 100644 src/relay/backend/contrib/cblas/codegen.cc create mode 100644 src/relay/backend/contrib/cblas/utils.cc diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake index bd8c0d0c445f..41d9df76bee6 100644 --- a/cmake/modules/contrib/BLAS.cmake +++ b/cmake/modules/contrib/BLAS.cmake @@ -17,11 +17,13 @@ # Plugin rules for cblas file(GLOB CBLAS_CONTRIB_SRC src/runtime/contrib/cblas/*.cc) +file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/*.cc) if(USE_BLAS STREQUAL "openblas") find_library(BLAS_LIBRARY openblas) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "mkl") if(NOT IS_DIRECTORY ${USE_MKL_PATH}) @@ -37,18 +39,21 @@ elseif(USE_BLAS STREQUAL "mkl") include_directories(${USE_MKL_PATH}/include) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY_MKL}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS 
${CBLAS_RELAY_CONTRIB_SRC}) add_definitions(-DUSE_MKL_BLAS=1) message(STATUS "Use BLAS library " ${BLAS_LIBRARY_MKL}) elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas") find_library(BLAS_LIBRARY cblas) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "apple") find_library(BLAS_LIBRARY Accelerate) include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "none") # pass diff --git a/python/tvm/relay/op/contrib/cblas/__init__.py b/python/tvm/relay/op/contrib/cblas/__init__.py new file mode 100644 index 000000000000..0da426ab4741 --- /dev/null +++ b/python/tvm/relay/op/contrib/cblas/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .extern_op import * diff --git a/python/tvm/relay/op/contrib/cblas/extern_op.py b/python/tvm/relay/op/contrib/cblas/extern_op.py new file mode 100644 index 000000000000..c5e3b40e0f04 --- /dev/null +++ b/python/tvm/relay/op/contrib/cblas/extern_op.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""CBLAS library supported operators.""" +from __future__ import absolute_import + + +def dense(attrs, args): + """Check if the external codegen should be used. + """ + # TODO(@comaniac) Check the attribute. 
+ return False diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index 8daec9c4d3bd..458bc2d361c1 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -27,7 +27,7 @@ """ from __future__ import absolute_import -from . import gcc +from . import gcc, cblas from .. import op as reg @reg.register_extern_op("nn.conv2d") @@ -40,6 +40,18 @@ def external_conv2d(attrs, args, compiler): raise RuntimeError("conv2d in {} is not registered" % (compiler)) +@reg.register_extern_op("nn.dense") +def external_dense(attrs, args, compiler): + """Check if the external compiler should be used. + """ + if compiler == "gcc": + return gcc.extern_op.dense(attrs, args) + if compiler == "cblas": + return cblas.extern_op.dense(attrs, args) + + raise RuntimeError("conv2d in {} is not registered" % (compiler)) + + @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): """Check if the external compiler should be used. diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc new file mode 100644 index 000000000000..8b6c2bb386bf --- /dev/null +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -0,0 +1,222 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +typedef void (*CblasFloat)(float* a, float* b, float* out, int M, int N, int K); +// typedef void (*CblasDouble)(float* a, float* b, float* out); + +class CblasModuleNode : public runtime:: ModuleNode { + public: + CblasModuleNode() = default; + ~CblasModuleNode() { + Close(); + } + + // void Init(const std::string& bin_path); + // void Exec(const std::string& fun_name, const TVMArgs& args); + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. + * + * \param name the name of the external function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. + */ + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) override { + CHECK(handle_) << "The external cblas module has not been built yet." 
+ << "\n"; + if (name == "nn.dense") { + func_s_ = reinterpret_cast(GetSymbol(name)); + char* error = dlerror(); + if (error != NULL) { + LOG(FATAL) << error; + return PackedFunc(); + } + return CallDense(sptr_to_self); + } else { + LOG(INFO) << "Only nn.dense is supported so far"; + return PackedFunc(); + } + } + + PackedFunc CallDense(const std::shared_ptr& sptr_to_self) { + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + runtime::NDArray data = args[0]; + runtime::NDArray weight = args[1]; + runtime::NDArray out = args[2]; + + const DLTensor* dptr = data.operator->(); + CHECK(runtime::TypeMatch(dptr->dtype, kDLFloat, 32)); + + float* d_data = reinterpret_cast(data->data); + float* weight_data = reinterpret_cast(weight->data); + float* out_data = reinterpret_cast(out->data); + + // Blas is column major. So we pass B, A, C + int M = CountColumn(weight); + int N = CountRow(data); + int K = CountColumn(data); + (*func_s_)(weight_data, d_data, out_data, M, N, K); + *rv = out; + }); + } + + /*! + * \brief Get the source code of the external module. + * + * \param format The format of the source code. + * + * \return The source code of the external library module in the text form. + */ + TVM_DLL std::string GetSource(const std::string& format = "") override { + return ""; + } + + const char* type_key() const final { + return "CblasModule"; + } + + void Build(const Expr& expr) { + Function func = Downcast(expr); + CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + const auto* call = func->body.as(); + CHECK(call) << "CBLAS expects a single convolution or dense op."; + + const auto* op_node = call->op.as(); + CHECK(op_node) << "CBLAS expects a single convolution or dense op."; + Op op = GetRef(op_node); + if (op == Op::Get("nn.conv2d")) { + const auto* conv2d_attr = call->attrs.as(); + // TODO(@zhiics) Generate the template. + } else if (op == Op::Get("nn.dense")) { + // TODO(@zhiics) Generate the template. + const auto* dense_attr = call->attrs.as(); + } else { + LOG(FATAL) << "CBLAS expects a single convolution or dense op."; + } + + int ret = std::system( + "g++ -O2 -Wall -std=c++11 -shared -fPIC -I/opt/intel/mkl/include utils.cc " + "-L/opt/intel/mkl/lib/intel64 -o /tmp/util.so -ldl -lpthread -lm -lmkl_rt"); + if (!ret) { + LOG(FATAL) << "Command failed"; + } + + Open("/tmp/subtract.so"); + } + + private: + // Get the number of row of a ndarray. + int CountRow(const runtime::NDArray& data) { + const DLTensor* tensor = data.operator->(); + return tensor->shape[0]; + } + + // Get the number of columns of a ndarray. + int CountColumn(const runtime::NDArray& data) { + const DLTensor* tensor = data.operator->(); + return tensor->shape[1]; + } + + // Platform dependent handlers for opening system lib. +#if defined(_WIN32) + // The handle. + HMODULE handle_{nullptr}; + + // Open the library + void Open(const std::string& name) { + std::wstring wname(name.begin(), name.end()); + handle_ = LoadLibraryW(wname.c_str()); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name; + } + + // Retrieve a symbol. + void* GetSymbol(const std::string& name) { + return reinterpret_cast(GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) + } + + // Close the handle. + void Close() { + FreeLibrary(handle_); + } +#else + // The handle. 
+ void* handle_{nullptr}; + + // load the library + void Open(const std::string& name) { + handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name << " " + << dlerror(); + } + + // Retrieve a symbol. + void* GetSymbol(const std::string& name) { + return dlsym(handle_, name.c_str()); + } + + void Close() { + dlclose(handle_); + } +#endif + CblasFloat func_s_; +}; + +runtime::Module CreateCblasModule(const Expr& expr) { + std::shared_ptr n = std::make_shared(); + n->Build(expr); + return runtime::Module(n); +} + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression and + * compile it into a runtime module. + */ +runtime::Module Compiler(const Expr& expr) { + return CreateCblasModule(expr); +} + +TVM_REGISTER_API("relay.ext.cblas") +.set_body_typed(Compiler); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/cblas/utils.cc b/src/relay/backend/contrib/cblas/utils.cc new file mode 100644 index 000000000000..035b335fb201 --- /dev/null +++ b/src/relay/backend/contrib/cblas/utils.cc @@ -0,0 +1,34 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef __cplusplus +extern "C" +{ +#include +#endif // extern "C" + +// TODO(@zhiics) Generate the signature that is consistent to cblas_sgemm +// directly. We can process the other parameters from attribute of a Relay call +// node. +void dense(float* A, float* B, float* C, int M, int N, int K) { + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, A, 1, B, 1, 0.0, C, 1); +} + +#ifdef __cplusplus +} +#endif diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index b37aa85a36c2..bd3c284f25ed 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -34,6 +34,7 @@ namespace tvm { namespace relay { namespace extern_op { +// A helper class to insert annotation boundaries for subgraphs. class ExternOpWrapper : public ExprMutator { public: explicit ExternOpWrapper(const std::string& compiler) : compiler_(compiler) {} @@ -72,102 +73,8 @@ class ExternOpWrapper : public ExprMutator { std::string compiler_; }; -/*! - * \brief Eleminates the back-to-back subgraph_begin(s) and end(e) annotations - * if they are using the same external compiler. For example, the following - * Relay program - * - * b - * | - * op1 - * | - * e - * | - * b - * | - * op2 - * | - * e - * - * will be updated to if op1 and op2 require codegen from the same external - * compiler. 
- * - * b - * | - * op1 - * | - * op2 - * | - * e - * - * However, in the following case (op1-6 and op8 use external compiler and op7 - * uses tvm codegen), we cannot simply cancel all back-to-back `start` and - * `end` annotations even if they use the same external compiler. - * - * For example, op1-6 and op8 would be grouped into the same subgraph if we - * cancel the back-to-back start and end annotations, leaving op7 alone in a - * separate subgraph. Unfortunately, it creates a cycle where one output of - * the former subgraph flows into the latter, and meanwhile it requires the - * the computed results of op7 from the latter subgraph. - * - * Hence, we should prevent op1-6 and op8 falling into the same subgraph all - * together in such a case. - * - * | - * b - * | - * op1 - * / | \ - * e e e - * | | | - * b b b - * | | | - * op2 op3 op4 - * | | | - * e e e - * | | | - * b b | - * | | | - * op5 op6 op7 - * | | | - * e e | - * | | | - * b b b - * \ | / - * op8 - * | - * e - * | - */ -struct EliminateAnnotation : public ExprMutator { - Expr VisitExpr_(const CallNode* cn) { - Expr new_e = ExprMutator::VisitExpr_(cn); - const auto* op_node = cn->op.as(); - if (op_node && GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { - Expr input = cn->args[0]; - if (input.as() == nullptr) return new_e; - Call input_call = Downcast(input); - if (input_call.defined()) { - const auto* call_op = input_call->op.as(); - if (call_op && - GetRef(call_op) == Op::Get("annotation.subgraph_end")) { - auto end_attrs = cn->attrs.as(); - auto begin_attrs = input_call->attrs.as(); - if (end_attrs && begin_attrs && - end_attrs->compiler == begin_attrs->compiler) { - // Eliminate end and begin - return input_call->args[0]; - } - } - } - } - return new_e; - } -}; - Expr ExternOp(const Expr& expr, const std::string& compiler) { - Expr annotated = ExternOpWrapper(compiler).Mutate(expr); - return annotated; //EliminateAnnotation().Mutate(annotated); + return ExternOpWrapper(compiler).Mutate(expr); } } // namespace extern_op diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index b6e7f46e58ff..64dfecbd4e74 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -218,7 +218,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const TupleNode* op) { + Expr VisitExpr_(const TupleNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -234,7 +234,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const TupleGetItemNode* g) { + Expr VisitExpr_(const TupleGetItemNode* g) final { auto subgraph = GetSubgraph(GetRef(g)); if (!subgraph) { return ExprMutator::VisitExpr_(g); @@ -245,7 +245,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const FunctionNode* op) { + Expr VisitExpr_(const FunctionNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -263,7 +263,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const LetNode* op) { + Expr VisitExpr_(const LetNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -279,7 +279,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const IfNode* op) { + Expr VisitExpr_(const IfNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -294,7 +294,7 @@ class Partitioner : public ExprMutator { } } - Expr 
VisitExpr_(const RefCreateNode* op) { + Expr VisitExpr_(const RefCreateNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -305,7 +305,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const RefReadNode* op) { + Expr VisitExpr_(const RefReadNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); @@ -316,7 +316,7 @@ class Partitioner : public ExprMutator { } } - Expr VisitExpr_(const RefWriteNode* op) { + Expr VisitExpr_(const RefWriteNode* op) final { auto subgraph = GetSubgraph(GetRef(op)); if (!subgraph) { return ExprMutator::VisitExpr_(op); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 1f707915c2f3..6eb40715fb3d 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -56,14 +56,15 @@ def test_partition_graph(): f = relay.Function([x], q) mod = relay.Module() mod["main"] = annotate(f) + print(mod['main']) mod = relay.transform.PartitionGraph()(mod) mod = relay.transform.InferType()(mod) print(mod['main']) - #x_data = np.random.rand(10, 10).astype('float32') - #y_data = np.random.rand(10, 10).astype('float32') - #ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - #res = ex.evaluate()(x_data, y_data) - #tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) + x_data = np.random.rand(10, 10).astype('float32') + y_data = np.random.rand(10, 10).astype('float32') + # ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + # res = ex.evaluate()(x_data) + # tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) def test_extern(): x = relay.var('x', shape=(10, 10)) From c8e14999e7ccb1275c192acc2f3c5ea14cbd53af Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Tue, 10 Sep 2019 11:32:55 -0700 Subject: [PATCH 11/34] make Cblas working and refactor contrib codegen --- CMakeLists.txt | 1 + cmake/modules/contrib/BLAS.cmake | 5 - cmake/modules/contrib/Extern.cmake | 30 ++++ include/tvm/relay/contrib_codegen.h | 166 ++++++++++++++++++ .../tvm/relay/op/contrib/cblas/extern_op.py | 3 +- python/tvm/relay/op/contrib/extern_op.py | 49 +++--- src/relay/backend/contrib/cblas/codegen.cc | 152 +++++----------- .../contrib/cblas/{utils.cc => libs.cc} | 6 +- .../gcc/codegen.cc} | 78 ++++---- .../gcc/libs.cc} | 8 +- .../gcc/libs.h} | 0 src/relay/pass/partition_graph.cc | 3 +- .../python/relay/test_pass_partition_graph.py | 29 ++- 13 files changed, 333 insertions(+), 197 deletions(-) create mode 100644 cmake/modules/contrib/Extern.cmake create mode 100644 include/tvm/relay/contrib_codegen.h rename src/relay/backend/contrib/cblas/{utils.cc => libs.cc} (79%) rename src/relay/backend/{test_external_codegen.cc => contrib/gcc/codegen.cc} (66%) rename src/relay/backend/{test_external_library.cc => contrib/gcc/libs.cc} (97%) rename src/relay/backend/{test_external_library.h => contrib/gcc/libs.h} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf18ffc9e856..19e026f95d82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,6 +252,7 @@ include(cmake/modules/LLVM.cmake) include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) +include(cmake/modules/contrib/Extern.cmake) include(cmake/modules/contrib/Random.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) include(cmake/modules/contrib/Sort.cmake) diff --git 
a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake index 41d9df76bee6..bd8c0d0c445f 100644 --- a/cmake/modules/contrib/BLAS.cmake +++ b/cmake/modules/contrib/BLAS.cmake @@ -17,13 +17,11 @@ # Plugin rules for cblas file(GLOB CBLAS_CONTRIB_SRC src/runtime/contrib/cblas/*.cc) -file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/*.cc) if(USE_BLAS STREQUAL "openblas") find_library(BLAS_LIBRARY openblas) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "mkl") if(NOT IS_DIRECTORY ${USE_MKL_PATH}) @@ -39,21 +37,18 @@ elseif(USE_BLAS STREQUAL "mkl") include_directories(${USE_MKL_PATH}/include) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY_MKL}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) add_definitions(-DUSE_MKL_BLAS=1) message(STATUS "Use BLAS library " ${BLAS_LIBRARY_MKL}) elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas") find_library(BLAS_LIBRARY cblas) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "apple") find_library(BLAS_LIBRARY Accelerate) include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) message(STATUS "Use BLAS library " ${BLAS_LIBRARY}) elseif(USE_BLAS STREQUAL "none") # pass diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake new file mode 100644 index 000000000000..c8e862ac5e84 --- /dev/null +++ b/cmake/modules/contrib/Extern.cmake @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +message(STATUS "Build with relay.backend.contrib") + +# Gcc (for demo purpose) +file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/*.cc) +list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) + +# CBLAS (for demo purpose) +file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/*.cc) +if(USE_BLAS STREQUAL "mkl") + list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) +elseif(USE_BLAS STREQUAL "none") + # pass +endif() diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h new file mode 100644 index 000000000000..1dc27523243b --- /dev/null +++ b/include/tvm/relay/contrib_codegen.h @@ -0,0 +1,166 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RELAY_CONTRIB_CODEGEN_H_ +#define TVM_RELAY_CONTRIB_CODEGEN_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +class ExternModuleNodeBase : public runtime:: ModuleNode { + public: + ExternModuleNodeBase() = default; + ~ExternModuleNodeBase() { + Close(); + } + + /*! + * \brief Get the full path of compiled external shared library of this compiler. + * + * \return The string of the library path. + */ + virtual const std::string GetExternLibPath() = 0; + + /*! + * \brief Build the shared library of external ops. + * + * \param expr The subgraph Relay expression to be executed using extern ops. + * + */ + virtual void Build(const Expr& expr) = 0; + + /*! + * \brief The extern module specific implementation of invoking pre-built functions. + * + * \param name the name of the external function. + * \param func_s The function symbol retrieved from the external library. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. + */ + virtual runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + const std::shared_ptr& sptr_to_self) = 0; + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. + * + * \param name the name of the external function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. 
+ */ + runtime::PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) override { + if (handle_ == nullptr) { + Open(this->GetExternLibPath()); + } + CHECK(handle_) << "The external cblas module has not been built or failed to be opened.\n"; + + auto func_s = GetSymbol(name); + char* error = dlerror(); + if (error != NULL) { + LOG(FATAL) << error; + return PackedFunc(); + } + return this->InvokeExternFunc(name, func_s, sptr_to_self); + } + + /*! + * \brief Get the source code of the external module. + * + * \param format The format of the source code. + * + * \return The source code of the external library module in the text form. + */ + TVM_DLL std::string GetSource(const std::string& format = "") override { + return ""; + } + + const char* type_key() const override { + return "ExternModule"; + } + + private: + // Platform dependent handlers for opening system lib. +#if defined(_WIN32) + // The handle. + HMODULE handle_{nullptr}; + + // Open the library + void Open(const std::string& name) { + std::wstring wname(name.begin(), name.end()); + handle_ = LoadLibraryW(wname.c_str()); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name; + } + + // Retrieve a symbol. + void* GetSymbol(const std::string& name) { + return reinterpret_cast(GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) + } + + // Close the handle. + void Close() { + FreeLibrary(handle_); + } +#else + // The handle. + void* handle_{nullptr}; + + // load the library + void Open(const std::string& name) { + handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name << " " + << dlerror(); + } + + // Retrieve a symbol. + void* GetSymbol(const std::string& name) { + std::string op_name = name; + if (op_name.find('.') != std::string::npos) { + op_name = op_name.substr(op_name.rfind('.') + 1); + } + return dlsym(handle_, op_name.c_str()); + } + + void Close() { + dlclose(handle_); + } +#endif +}; + +} // namespace contrib +} // namespace relay +} // namespace tvm +#endif \ No newline at end of file diff --git a/python/tvm/relay/op/contrib/cblas/extern_op.py b/python/tvm/relay/op/contrib/cblas/extern_op.py index c5e3b40e0f04..b7959aefa65d 100644 --- a/python/tvm/relay/op/contrib/cblas/extern_op.py +++ b/python/tvm/relay/op/contrib/cblas/extern_op.py @@ -22,5 +22,4 @@ def dense(attrs, args): """Check if the external codegen should be used. """ - # TODO(@comaniac) Check the attribute. - return False + return True diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index 458bc2d361c1..67f36ff38720 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -27,56 +27,59 @@ """ from __future__ import absolute_import -from . import gcc, cblas +import sys +import pkgutil +from pathlib import Path +from importlib import import_module + from .. 
import op as reg +# Load available contrib compilers +compilers = {} +for _, name, _ in pkgutil.iter_modules([Path(__file__).parent]): + compilers[name] = import_module('.%s' % name, package='.'.join(__name__.split('.')[:-1])) + +def get_extern_op(compiler, op_name): + """Get the extern op function from the registered compiler + """ + if compiler in compilers: + if hasattr(compilers[compiler], 'extern_op'): + extern_op = getattr(compilers[compiler], 'extern_op') + if hasattr(extern_op, op_name): + return getattr(extern_op, op_name) + + raise RuntimeError("%s in %s is not registered" % (op_name, compiler)) + @reg.register_extern_op("nn.conv2d") def external_conv2d(attrs, args, compiler): """Check if the external compiler should be used. """ - if compiler == "gcc": - return gcc.extern_op.conv2d(attrs, args) - - raise RuntimeError("conv2d in {} is not registered" % (compiler)) + return get_extern_op(compiler, 'conv2d')(attrs, args) @reg.register_extern_op("nn.dense") def external_dense(attrs, args, compiler): """Check if the external compiler should be used. """ - if compiler == "gcc": - return gcc.extern_op.dense(attrs, args) - if compiler == "cblas": - return cblas.extern_op.dense(attrs, args) - - raise RuntimeError("conv2d in {} is not registered" % (compiler)) + return get_extern_op(compiler, 'dense')(attrs, args) @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): """Check if the external compiler should be used. """ - if compiler == "gcc": - return gcc.extern_op.subtract(attrs, args) - - raise RuntimeError("subtract in {} is not registered" % (compiler)) + return get_extern_op(compiler, 'subtract')(attrs, args) @reg.register_extern_op("add") def external_add(attrs, args, compiler): """Check if the external compiler should be used. """ - if compiler == "gcc": - return gcc.extern_op.add(attrs, args) - - raise RuntimeError("add in {} is not registered" % (compiler)) + return get_extern_op(compiler, 'add')(attrs, args) @reg.register_extern_op("multiply") def external_multiply(attrs, args, compiler): """Check if the external compiler should be used. """ - if compiler == "gcc": - return gcc.extern_op.multiply(attrs, args) - - raise RuntimeError("multiply in {} is not registered" % (compiler)) + return get_extern_op(compiler, 'multiply')(attrs, args) diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc index 8b6c2bb386bf..df77895d3bdd 100644 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -36,69 +37,52 @@ namespace relay { namespace contrib { typedef void (*CblasFloat)(float* a, float* b, float* out, int M, int N, int K); -// typedef void (*CblasDouble)(float* a, float* b, float* out); -class CblasModuleNode : public runtime:: ModuleNode { +class CblasModuleNode : public ExternModuleNodeBase { public: - CblasModuleNode() = default; - ~CblasModuleNode() { - Close(); + const std::string GetExternLibPath() override { + return "/tmp/relay_extern_cblas.so"; } - // void Init(const std::string& bin_path); - // void Exec(const std::string& fun_name, const TVMArgs& args); - /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked * for execution given some parameters. * * \param name the name of the external function. + * \param func_s The function symbol retrieved from the external library. * \param sptr_to_self The shared_ptr that points to this module node. 
* * \return PackedFunc(nullptr) when it is not available. */ - PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override { - CHECK(handle_) << "The external cblas module has not been built yet." - << "\n"; + runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + const std::shared_ptr& sptr_to_self) override { if (name == "nn.dense") { - func_s_ = reinterpret_cast(GetSymbol(name)); - char* error = dlerror(); - if (error != NULL) { - LOG(FATAL) << error; - return PackedFunc(); - } - return CallDense(sptr_to_self); + func_s_ = reinterpret_cast(func_s); + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + runtime::NDArray data = args[0]; + runtime::NDArray weight = args[1]; + runtime::NDArray out = args[2]; + + const DLTensor* dptr = data.operator->(); + CHECK(runtime::TypeMatch(dptr->dtype, kDLFloat, 32)); + + float* d_data = reinterpret_cast(data->data); + float* weight_data = reinterpret_cast(weight->data); + float* out_data = reinterpret_cast(out->data); + + int M = CountRow(data); + int N = CountColumn(weight); + int K = CountColumn(data); + (*func_s_)(d_data, weight_data, out_data, M, N, K); + *rv = out; + }); } else { - LOG(INFO) << "Only nn.dense is supported so far"; + LOG(INFO) << name << " is not Supported. Only nn.dense is supported so far"; return PackedFunc(); } } - PackedFunc CallDense(const std::shared_ptr& sptr_to_self) { - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - runtime::NDArray data = args[0]; - runtime::NDArray weight = args[1]; - runtime::NDArray out = args[2]; - - const DLTensor* dptr = data.operator->(); - CHECK(runtime::TypeMatch(dptr->dtype, kDLFloat, 32)); - - float* d_data = reinterpret_cast(data->data); - float* weight_data = reinterpret_cast(weight->data); - float* out_data = reinterpret_cast(out->data); - - // Blas is column major. So we pass B, A, C - int M = CountColumn(weight); - int N = CountRow(data); - int K = CountColumn(data); - (*func_s_)(weight_data, d_data, out_data, M, N, K); - *rv = out; - }); - } - /*! * \brief Get the source code of the external module. * @@ -110,11 +94,11 @@ class CblasModuleNode : public runtime:: ModuleNode { return ""; } - const char* type_key() const final { + const char* type_key() const override { return "CblasModule"; } - void Build(const Expr& expr) { + void Build(const Expr& expr) override { Function func = Downcast(expr); CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); @@ -124,23 +108,26 @@ class CblasModuleNode : public runtime:: ModuleNode { CHECK(op_node) << "CBLAS expects a single convolution or dense op."; Op op = GetRef(op_node); if (op == Op::Get("nn.conv2d")) { - const auto* conv2d_attr = call->attrs.as(); + // const auto* conv2d_attr = call->attrs.as(); // TODO(@zhiics) Generate the template. + ; } else if (op == Op::Get("nn.dense")) { // TODO(@zhiics) Generate the template. - const auto* dense_attr = call->attrs.as(); + //const auto* dense_attr = call->attrs.as(); + ; } else { LOG(FATAL) << "CBLAS expects a single convolution or dense op."; } - int ret = std::system( - "g++ -O2 -Wall -std=c++11 -shared -fPIC -I/opt/intel/mkl/include utils.cc " - "-L/opt/intel/mkl/lib/intel64 -o /tmp/util.so -ldl -lpthread -lm -lmkl_rt"); - if (!ret) { - LOG(FATAL) << "Command failed"; + if (!std::getenv("MKLROOT")) { + LOG(FATAL) << "MKLROOT not found. 
Did you source mklvars.sh?"; + } + int ret = std::system("g++ -O2 -Wall -std=c++11 -shared -fPIC " + "src/relay/backend/contrib/cblas/libs.cc " + "-o /tmp/relay_extern_cblas.so -ldl -lpthread -lm -lmkl_rt"); + if (ret != 0) { + LOG(FATAL) << "Failed to compile CBLAS library. Error code: " << ret; } - - Open("/tmp/subtract.so"); } private: @@ -156,66 +143,21 @@ class CblasModuleNode : public runtime:: ModuleNode { return tensor->shape[1]; } - // Platform dependent handlers for opening system lib. -#if defined(_WIN32) - // The handle. - HMODULE handle_{nullptr}; - - // Open the library - void Open(const std::string& name) { - std::wstring wname(name.begin(), name.end()); - handle_ = LoadLibraryW(wname.c_str()); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name; - } - - // Retrieve a symbol. - void* GetSymbol(const std::string& name) { - return reinterpret_cast(GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) - } - - // Close the handle. - void Close() { - FreeLibrary(handle_); - } -#else - // The handle. - void* handle_{nullptr}; - - // load the library - void Open(const std::string& name) { - handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name << " " - << dlerror(); - } - - // Retrieve a symbol. - void* GetSymbol(const std::string& name) { - return dlsym(handle_, name.c_str()); - } - - void Close() { - dlclose(handle_); - } -#endif CblasFloat func_s_; }; -runtime::Module CreateCblasModule(const Expr& expr) { - std::shared_ptr n = std::make_shared(); - n->Build(expr); - return runtime::Module(n); -} - /*! * \brief The external compiler/codegen tool. It takes a Relay expression and * compile it into a runtime module. */ -runtime::Module Compiler(const Expr& expr) { - return CreateCblasModule(expr); +runtime::Module CblasCompiler(const Expr& expr) { + std::shared_ptr n = std::make_shared(); + n->Build(expr); + return runtime::Module(n); } TVM_REGISTER_API("relay.ext.cblas") -.set_body_typed(Compiler); +.set_body_typed(CblasCompiler); } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/cblas/utils.cc b/src/relay/backend/contrib/cblas/libs.cc similarity index 79% rename from src/relay/backend/contrib/cblas/utils.cc rename to src/relay/backend/contrib/cblas/libs.cc index 035b335fb201..3998b8a49c33 100644 --- a/src/relay/backend/contrib/cblas/utils.cc +++ b/src/relay/backend/contrib/cblas/libs.cc @@ -20,13 +20,11 @@ extern "C" { #include +#include #endif // extern "C" -// TODO(@zhiics) Generate the signature that is consistent to cblas_sgemm -// directly. We can process the other parameters from attribute of a Relay call -// node. 
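+// nn.dense is lowered to a single row-major SGEMM with the weight transposed:
+// C (M x N) = A (M x K) * B^T, where A is the input data and B is the Relay
+// dense weight matrix (the test compares against np.dot(x, y.T)).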
void dense(float* A, float* B, float* C, int M, int N, int K) { - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, A, 1, B, 1, 0.0, C, 1); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); } #ifdef __cplusplus diff --git a/src/relay/backend/test_external_codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc similarity index 66% rename from src/relay/backend/test_external_codegen.cc rename to src/relay/backend/contrib/gcc/codegen.cc index 0723d7976c0b..37eb5b458e86 100644 --- a/src/relay/backend/test_external_codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -17,51 +17,42 @@ */ #include #include +#include #include #include #include #include #include -#include "test_external_library.h" +#include "libs.h" namespace tvm { namespace relay { +namespace contrib { -typedef void (*sub)(ExternalTensor a, ExternalTensor b, ExternalTensor* out); +typedef void (*GccBinaryFunc)(ExternalTensor a, ExternalTensor b, ExternalTensor* out); -class ExternalModuleNode : public runtime:: ModuleNode { +class GccModuleNode : public ExternModuleNodeBase { public: - ExternalModuleNode() = default; - ~ExternalModuleNode() { - if (handle_ != nullptr) { - dlclose(handle_); - } - } - // void Init(const std::string& bin_path); - // void Exec(const std::string& fun_name, const TVMArgs& args); + const std::string GetExternLibPath() override { + return "/tmp/relay_extern_gcc.so"; + } /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked * for execution given some parameters. * * \param name the name of the external function. + * \param func_s The function symbol retrieved from the external library. * \param sptr_to_self The shared_ptr that points to this module node. * * \return PackedFunc(nullptr) when it is not available. */ - PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override { - if (name == "Subtract" || "Add" || "Multiply") { - CHECK(handle_) << "You need to build the external module first"; - func_s_ = reinterpret_cast(dlsym(handle_, name.c_str())); - char* error = dlerror(); - if (error != NULL) { - LOG(FATAL) << error; - return PackedFunc(); - } + runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + const std::shared_ptr& sptr_to_self) override { + if (name == "subtract" || "add" || "multiply") { + func_s_ = reinterpret_cast(func_s); return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { CHECK_EQ(args.size(), 3U); @@ -109,35 +100,25 @@ class ExternalModuleNode : public runtime:: ModuleNode { return ""; } - const char* type_key() const final { - return "ExternalModule"; + const char* type_key() const override { + return "GccModule"; } - void Build() { - std::system( - "g++ -std=c++11 -shared -fPIC -ldl src/relay/backend/test_external_library.cc -o /tmp/subtract.so"); - handle_ = dlopen("/tmp/subtract.so", RTLD_LAZY); - if (!handle_) { - LOG(FATAL) << "Cannot open library: " << dlerror() << '\n'; + void Build(const Expr& expr) override { + Function func = Downcast(expr); + CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + int ret = std::system( + "g++ -std=c++11 -shared -fPIC -ldl src/relay/backend/contrib/gcc/libs.cc " + "-o /tmp/relay_extern_gcc.so"); + if (ret != 0) { + LOG(FATAL) << "Failed to compile GCC library. 
Error code: " << ret; } } private: - void* handle_{nullptr}; - sub func_s_; + GccBinaryFunc func_s_; }; -runtime::Module CreateExternalModule() { - std::shared_ptr n = std::make_shared(); - n->Build(); - return runtime::Module(n); -} - -} // namespace relay -} // namespace tvm - -namespace tvm { -namespace relay { /*! * \brief The external compiler/codegen tool. It takes a Relay expression and @@ -152,14 +133,15 @@ namespace relay { * a single expression/function. * 2. Return runtime::Module. */ -runtime::Module Compiler(const Expr& expr) { - Function func = Downcast(expr); - CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; - return CreateExternalModule(); +runtime::Module GccCompiler(const Expr& expr) { + std::shared_ptr n = std::make_shared(); + n->Build(expr); + return runtime::Module(n); } TVM_REGISTER_API("relay.ext.gcc") -.set_body_typed(Compiler); +.set_body_typed(GccCompiler); +} // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/test_external_library.cc b/src/relay/backend/contrib/gcc/libs.cc similarity index 97% rename from src/relay/backend/test_external_library.cc rename to src/relay/backend/contrib/gcc/libs.cc index a076785bfb44..4c81567c3845 100644 --- a/src/relay/backend/test_external_library.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -16,7 +16,7 @@ * under the License. */ -#include "test_external_library.h" +#include "libs.h" #include #include @@ -56,9 +56,9 @@ } \ } -GCC_BINARY_OP(Subtract, -); -GCC_BINARY_OP(Add, +); -GCC_BINARY_OP(Multiply, *); +GCC_BINARY_OP(subtract, -); +GCC_BINARY_OP(add, +); +GCC_BINARY_OP(multiply, *); // extern "C" void Subtract(ExternalTensor a, ExternalTensor b, ExternalTensor* out) { // if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { diff --git a/src/relay/backend/test_external_library.h b/src/relay/backend/contrib/gcc/libs.h similarity index 100% rename from src/relay/backend/test_external_library.h rename to src/relay/backend/contrib/gcc/libs.h diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 64dfecbd4e74..2c43e222f609 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -202,11 +202,10 @@ class Partitioner : public ExprMutator { // external funciton and leave the processing of the function to codegen. // Otherwise, it's hard to deal with multiple-node subgraphs. 
Expr arg0 = call->args[0]; - std::string name = "Subgraph"; + std::string name = "subgraph"; if (const auto* arg_call = arg0.as()) { if (const auto* op_node = arg_call->op.as()) { name = op_node->name; - name[0] = name[0] - 32; } } subgraph_func = diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 6eb40715fb3d..ec7049a5d041 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -60,13 +60,13 @@ def test_partition_graph(): mod = relay.transform.PartitionGraph()(mod) mod = relay.transform.InferType()(mod) print(mod['main']) - x_data = np.random.rand(10, 10).astype('float32') - y_data = np.random.rand(10, 10).astype('float32') + #x_data = np.random.rand(10, 10).astype('float32') + #y_data = np.random.rand(10, 10).astype('float32') # ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) # res = ex.evaluate()(x_data) # tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) -def test_extern(): +def test_extern_gcc(): x = relay.var('x', shape=(10, 10)) y = relay.var('y', shape=(10, 10)) z = x + x @@ -83,6 +83,27 @@ def test_extern(): res = ex.evaluate()(x_data, y_data) tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) +def test_extern_cblas(): + m = 16 + n = 224 + k = 224 + x = relay.var('x', shape=(m, k)) + y = relay.var('y', shape=(n, k)) + f = relay.Function([x, y], relay.op.nn.dense(x, y)) + mod = relay.Module() + mod['main'] = f + mod = relay.transform.ExternOp('cblas')(mod) + mod = relay.transform.PartitionGraph()(mod) + print(mod['main']) + + x_data = np.random.uniform(low=0, high=1, size=(m, k)).astype('float32') + y_data = np.random.uniform(low=0, high=1, size=(n, k)).astype('float32') + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose( + res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) + if __name__ == "__main__": test_partition_graph() - test_extern() + test_extern_gcc() + test_extern_cblas() From 7eef6f1b9e0ea19c8c2d9faa51ccf3afb03700d1 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 10 Sep 2019 21:45:05 +0000 Subject: [PATCH 12/34] small fix for style and check handle_ before closing it --- include/tvm/relay/contrib_codegen.h | 16 ++++++++++------ include/tvm/relay/op_attr_types.h | 6 +++--- python/tvm/relay/op/contrib/extern_op.py | 1 - src/relay/backend/contrib/cblas/codegen.cc | 15 ++++++++------- src/relay/backend/contrib/cblas/libs.cc | 5 ++--- src/relay/backend/contrib/gcc/codegen.cc | 3 +-- src/relay/pass/partition_graph.cc | 8 ++++---- 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 1dc27523243b..8cc81f808d17 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -21,12 +21,12 @@ #include #include #include -#include #include #include #include #include #include +#include #if defined(_WIN32) #include @@ -50,7 +50,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return The string of the library path. */ - virtual const std::string GetExternLibPath() = 0; + virtual const std::string GetExternLibPath() const = 0; /*! * \brief Build the shared library of external ops. 
@@ -86,7 +86,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { if (handle_ == nullptr) { Open(this->GetExternLibPath()); } - CHECK(handle_) << "The external cblas module has not been built or failed to be opened.\n"; + CHECK(handle_) << "The external cblas module has not been built or failed to open.\n"; auto func_s = GetSymbol(name); char* error = dlerror(); @@ -132,7 +132,9 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { // Close the handle. void Close() { - FreeLibrary(handle_); + if (handle_) { + FreeLibrary(handle_); + } } #else // The handle. @@ -155,7 +157,9 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } void Close() { - dlclose(handle_); + if (handle_) { + dlclose(handle_); + } } #endif }; @@ -163,4 +167,4 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } // namespace contrib } // namespace relay } // namespace tvm -#endif \ No newline at end of file +#endif // TVM_RELAY_CONTRIB_CODEGEN_H_ diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 02e1a6ac08d4..d86745916fb6 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -159,9 +159,9 @@ using FTVMLegalize = runtime::TypedPackedFunc< * otherwise, false. */ using FTVMExternOp = runtime::TypedPackedFunc< -bool(const Attrs& attrs, - const Array& args, - const std::string& compiler)>; + bool(const Attrs& attrs, // NOLINT(*) + const Array& args, + const std::string& compiler)>; /*! * \brief Forward rewriting rule for a specific op. diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index 67f36ff38720..a6ba2dfa525b 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -27,7 +27,6 @@ """ from __future__ import absolute_import -import sys import pkgutil from pathlib import Path from importlib import import_module diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc index df77895d3bdd..8bcae7512b09 100644 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -37,10 +37,11 @@ namespace relay { namespace contrib { typedef void (*CblasFloat)(float* a, float* b, float* out, int M, int N, int K); +typedef void (*CblasDouble)(double* a, double* b, double* out, int M, int N, int K); class CblasModuleNode : public ExternModuleNodeBase { public: - const std::string GetExternLibPath() override { + const std::string GetExternLibPath() const override { return "/tmp/relay_extern_cblas.so"; } @@ -110,11 +111,9 @@ class CblasModuleNode : public ExternModuleNodeBase { if (op == Op::Get("nn.conv2d")) { // const auto* conv2d_attr = call->attrs.as(); // TODO(@zhiics) Generate the template. - ; } else if (op == Op::Get("nn.dense")) { // TODO(@zhiics) Generate the template. - //const auto* dense_attr = call->attrs.as(); - ; + // const auto* dense_attr = call->attrs.as(); } else { LOG(FATAL) << "CBLAS expects a single convolution or dense op."; } @@ -131,15 +130,17 @@ class CblasModuleNode : public ExternModuleNodeBase { } private: - // Get the number of row of a ndarray. - int CountRow(const runtime::NDArray& data) { + // Get the number of rows of a ndarray. + int CountRow(const runtime::NDArray& data) const { const DLTensor* tensor = data.operator->(); + CHECK(tensor) << "No container is defined in the NDArray" << "\n"; return tensor->shape[0]; } // Get the number of columns of a ndarray. 
- int CountColumn(const runtime::NDArray& data) { + int CountColumn(const runtime::NDArray& data) const { const DLTensor* tensor = data.operator->(); + CHECK(tensor) << "No container is defined in the NDArray" << "\n"; return tensor->shape[1]; } diff --git a/src/relay/backend/contrib/cblas/libs.cc b/src/relay/backend/contrib/cblas/libs.cc index 3998b8a49c33..fb4e590d701b 100644 --- a/src/relay/backend/contrib/cblas/libs.cc +++ b/src/relay/backend/contrib/cblas/libs.cc @@ -17,14 +17,13 @@ */ #ifdef __cplusplus -extern "C" -{ +extern "C" { #include #include #endif // extern "C" void dense(float* A, float* B, float* C, int M, int N, int K) { - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); } #ifdef __cplusplus diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 37eb5b458e86..1b5a0789aeca 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -34,8 +34,7 @@ typedef void (*GccBinaryFunc)(ExternalTensor a, ExternalTensor b, ExternalTensor class GccModuleNode : public ExternModuleNodeBase { public: - - const std::string GetExternLibPath() override { + const std::string GetExternLibPath() const override { return "/tmp/relay_extern_gcc.so"; } diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 2c43e222f609..e205a6fa6b60 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -144,13 +144,12 @@ class Partitioner : public ExprMutator { return ExprMutator::VisitExpr_(call); } else if (GetRef(op_node) == Op::Get("annotation.subgraph_begin")) { // The annotation node is inserted on edge so it must have only one argument. - CHECK(call->args.size() == 1); + CHECK_EQ(call->args.size(), 1U); // Traverse the rest graph. auto input_expr = VisitExpr(call->args[0]); // Replace the begin annotation with an external call input variable. - // TODO: Confirm if it is safe to use checked_type_ instead of checked_type() auto subgraph_attrs = call->attrs.as(); auto var = VarNode::make(subgraph_attrs->compiler + "_input" + std::to_string(var_id_++), input_expr->checked_type_); @@ -166,7 +165,7 @@ class Partitioner : public ExprMutator { } else { CHECK(GetRef(op_node) == Op::Get("annotation.subgraph_end")); // The annotation node is inserted on edge so it must have only one argument. 
- CHECK(call->args.size() == 1); + CHECK_EQ(call->args.size(), 1U); auto subgraph_attrs = call->attrs.as(); @@ -395,7 +394,8 @@ Pass PartitionGraph() { return Sequential({partitioned, InferType()}); } -TVM_REGISTER_API("relay._transform.PartitionGraph").set_body_typed(transform::PartitionGraph); +TVM_REGISTER_API("relay._transform.PartitionGraph") +.set_body_typed(transform::PartitionGraph); } // namespace transform From 001f9c695c347743a3344be7014900c720cddbb2 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 11 Sep 2019 16:21:08 -0700 Subject: [PATCH 13/34] refactor the interface for different data types --- include/tvm/relay/contrib_codegen.h | 63 ++++++++++++------- .../tvm/relay/op/contrib/cblas/extern_op.py | 2 +- src/relay/backend/contrib/cblas/codegen.cc | 42 ++++++++----- src/relay/backend/contrib/cblas/libs.cc | 6 +- src/relay/backend/contrib/gcc/codegen.cc | 8 +-- .../python/relay/test_pass_partition_graph.py | 28 ++++----- 6 files changed, 91 insertions(+), 58 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 8cc81f808d17..7b294b56c40e 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -46,11 +46,11 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } /*! - * \brief Get the full path of compiled external shared library of this compiler. + * \brief Get the full path of compiled external shared libraries of this compiler. * - * \return The string of the library path. + * \return An array of strings of the library paths. */ - virtual const std::string GetExternLibPath() const = 0; + virtual const std::vector GetExternLibPaths() const = 0; /*! * \brief Build the shared library of external ops. @@ -69,7 +69,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return PackedFunc(nullptr) when it is not available. */ - virtual runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + virtual runtime::PackedFunc InvokeExternFunc(const std::string& name, const std::shared_ptr& sptr_to_self) = 0; /*! @@ -83,18 +83,18 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - if (handle_ == nullptr) { - Open(this->GetExternLibPath()); + if (!IsHandleOpen()) { + Open(this->GetExternLibPaths()); } - CHECK(handle_) << "The external cblas module has not been built or failed to open.\n"; + CHECK(handle_) << "The external module has not been built or failed to open.\n"; - auto func_s = GetSymbol(name); + auto func_s = this->InvokeExternFunc(name, sptr_to_self); char* error = dlerror(); if (error != NULL) { LOG(FATAL) << error; return PackedFunc(); } - return this->InvokeExternFunc(name, func_s, sptr_to_self); + return func_s; } /*! @@ -112,26 +112,36 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { return "ExternModule"; } - private: + /*! + * \brief Check if the library is opened or not. + * + * + * \return True if the library is already opened. + */ + virtual bool IsHandleOpen() { + return handle_ != nullptr; + } + + protected: // Platform dependent handlers for opening system lib. #if defined(_WIN32) // The handle. 
HMODULE handle_{nullptr}; // Open the library - void Open(const std::string& name) { + virtual void Open(const std::string& name) { std::wstring wname(name.begin(), name.end()); handle_ = LoadLibraryW(wname.c_str()); CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name; } // Retrieve a symbol. - void* GetSymbol(const std::string& name) { + virtual void* GetSymbol(const std::string& name) { return reinterpret_cast(GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) } // Close the handle. - void Close() { + virtual void Close() { if (handle_) { FreeLibrary(handle_); } @@ -141,22 +151,27 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { void* handle_{nullptr}; // load the library - void Open(const std::string& name) { - handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name << " " + virtual void Open(const std::vector lib_names) { + CHECK(lib_names.size() == 1) << "Default library loader only loads one library. " + << "Please override the loader if multiple libraries are used"; + handle_ = dlopen(lib_names[0].c_str(), RTLD_LAZY | RTLD_LOCAL); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << lib_names[0] << " " << dlerror(); } - // Retrieve a symbol. - void* GetSymbol(const std::string& name) { - std::string op_name = name; - if (op_name.find('.') != std::string::npos) { - op_name = op_name.substr(op_name.rfind('.') + 1); - } - return dlsym(handle_, op_name.c_str()); + /*! + * \brief Retrieve the pre-compiled function symbol from the opened library. + * + * \param name the name of the external function. + * + * \return The pointer to the external function. + * \note Exceptions when loading the symbol can be retrieved by dlerror(). + */ + virtual void* GetSymbol(const std::string& name) { + return dlsym(handle_, name.c_str()); } - void Close() { + virtual void Close() { if (handle_) { dlclose(handle_); } diff --git a/python/tvm/relay/op/contrib/cblas/extern_op.py b/python/tvm/relay/op/contrib/cblas/extern_op.py index b7959aefa65d..8de8a3f45534 100644 --- a/python/tvm/relay/op/contrib/cblas/extern_op.py +++ b/python/tvm/relay/op/contrib/cblas/extern_op.py @@ -22,4 +22,4 @@ def dense(attrs, args): """Check if the external codegen should be used. """ - return True + return (args[0]._checked_type_.dtype == 'float32' or args[0]._checked_type_.dtype == 'float64') diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc index 8bcae7512b09..735faf3cba19 100644 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -41,8 +41,8 @@ typedef void (*CblasDouble)(double* a, double* b, double* out, int M, int N, int class CblasModuleNode : public ExternModuleNodeBase { public: - const std::string GetExternLibPath() const override { - return "/tmp/relay_extern_cblas.so"; + const std::vector GetExternLibPaths() const override { + return {"/tmp/relay_extern_cblas.so"}; } /*! @@ -55,27 +55,41 @@ class CblasModuleNode : public ExternModuleNodeBase { * * \return PackedFunc(nullptr) when it is not available. 
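+   * \note The symbol looked up in the external library encodes the tensor
+   *       data type, e.g. "dense_2_32" or "dense_2_64" (DLDataType code and
+   *       bits), matching the kernels defined in libs.cc.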
*/ - runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + runtime::PackedFunc InvokeExternFunc(const std::string& name, const std::shared_ptr& sptr_to_self) override { if (name == "nn.dense") { - func_s_ = reinterpret_cast(func_s); + curr_op_name = "dense"; return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { CHECK_EQ(args.size(), 3U); runtime::NDArray data = args[0]; runtime::NDArray weight = args[1]; runtime::NDArray out = args[2]; - - const DLTensor* dptr = data.operator->(); - CHECK(runtime::TypeMatch(dptr->dtype, kDLFloat, 32)); - - float* d_data = reinterpret_cast(data->data); - float* weight_data = reinterpret_cast(weight->data); - float* out_data = reinterpret_cast(out->data); - int M = CountRow(data); int N = CountColumn(weight); int K = CountColumn(data); - (*func_s_)(d_data, weight_data, out_data, M, N, K); + + const DLTensor* dptr = data.operator->(); + std::string encoded_name = curr_op_name + "_" + std::to_string(dptr->dtype.code) + "_" + + std::to_string(dptr->dtype.bits); + + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + float* d_data = reinterpret_cast(data->data); + float* weight_data = reinterpret_cast(weight->data); + float* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + (*func_s_)(d_data, weight_data, out_data, M, N, K); + } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { + double* d_data = reinterpret_cast(data->data); + double* weight_data = reinterpret_cast(weight->data); + double* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + (*func_s_)(d_data, weight_data, out_data, M, N, K); + } else { + LOG(FATAL) << "Only support float32 and float64 types."; + } + *rv = out; }); } else { @@ -144,7 +158,7 @@ class CblasModuleNode : public ExternModuleNodeBase { return tensor->shape[1]; } - CblasFloat func_s_; + std::string curr_op_name; }; /*! diff --git a/src/relay/backend/contrib/cblas/libs.cc b/src/relay/backend/contrib/cblas/libs.cc index fb4e590d701b..d9a33f880a66 100644 --- a/src/relay/backend/contrib/cblas/libs.cc +++ b/src/relay/backend/contrib/cblas/libs.cc @@ -22,10 +22,14 @@ extern "C" { #include #endif // extern "C" -void dense(float* A, float* B, float* C, int M, int N, int K) { +void dense_2_32(float* A, float* B, float* C, int M, int N, int K) { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); } +void dense_2_64(double* A, double* B, double* C, int M, int N, int K) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); +} + #ifdef __cplusplus } #endif diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 1b5a0789aeca..d4e8fb7af811 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -34,8 +34,8 @@ typedef void (*GccBinaryFunc)(ExternalTensor a, ExternalTensor b, ExternalTensor class GccModuleNode : public ExternModuleNodeBase { public: - const std::string GetExternLibPath() const override { - return "/tmp/relay_extern_gcc.so"; + const std::vector GetExternLibPaths() const override { + return {"/tmp/relay_extern_gcc.so"}; } /*! @@ -48,10 +48,10 @@ class GccModuleNode : public ExternModuleNodeBase { * * \return PackedFunc(nullptr) when it is not available. 
*/ - runtime::PackedFunc InvokeExternFunc(const std::string& name, void* func_s, + runtime::PackedFunc InvokeExternFunc(const std::string& name, const std::shared_ptr& sptr_to_self) override { if (name == "subtract" || "add" || "multiply") { - func_s_ = reinterpret_cast(func_s); + func_s_ = reinterpret_cast(GetSymbol(name)); return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { CHECK_EQ(args.size(), 3U); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index ec7049a5d041..4d8be1461e48 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -87,21 +87,21 @@ def test_extern_cblas(): m = 16 n = 224 k = 224 - x = relay.var('x', shape=(m, k)) - y = relay.var('y', shape=(n, k)) - f = relay.Function([x, y], relay.op.nn.dense(x, y)) - mod = relay.Module() - mod['main'] = f - mod = relay.transform.ExternOp('cblas')(mod) - mod = relay.transform.PartitionGraph()(mod) - print(mod['main']) + for dtype in ['float32', 'float64']: + x = relay.var('x', shape=(m, k), dtype=dtype) + y = relay.var('y', shape=(n, k), dtype=dtype) + f = relay.Function([x, y], relay.op.nn.dense(x, y)) + mod = relay.Module() + mod['main'] = f + mod = relay.transform.ExternOp('cblas')(mod) + mod = relay.transform.PartitionGraph()(mod) - x_data = np.random.uniform(low=0, high=1, size=(m, k)).astype('float32') - y_data = np.random.uniform(low=0, high=1, size=(n, k)).astype('float32') - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose( - res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) + x_data = np.random.uniform(0, 1, (m, k)).astype(dtype) + y_data = np.random.uniform(0, 1, (n, k)).astype(dtype) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose( + res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) if __name__ == "__main__": test_partition_graph() From 7febcdb875deb9e2df472eacb27ca6746e02c052 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 18 Sep 2019 15:55:29 -0700 Subject: [PATCH 14/34] add MKLDNN support and refine interface --- cmake/modules/contrib/Extern.cmake | 9 +- include/tvm/relay/contrib_codegen.h | 64 ++++- python/tvm/relay/op/contrib/dnnl/__init__.py | 20 ++ python/tvm/relay/op/contrib/dnnl/extern_op.py | 34 +++ python/tvm/relay/op/contrib/extern_op.py | 12 +- src/relay/backend/contrib/cblas/codegen.cc | 149 +++++------ src/relay/backend/contrib/cblas/libs.cc | 23 +- src/relay/backend/contrib/dnnl/codegen.cc | 236 ++++++++++++++++++ src/relay/backend/contrib/dnnl/libs.cc | 148 +++++++++++ src/relay/backend/contrib/gcc/codegen.cc | 107 +++++--- src/relay/backend/contrib/gcc/libs.cc | 35 --- src/relay/pass/extern_op.cc | 3 + src/relay/pass/partition_graph.cc | 4 +- .../python/relay/test_pass_partition_graph.py | 99 ++++++-- 14 files changed, 752 insertions(+), 191 deletions(-) create mode 100644 python/tvm/relay/op/contrib/dnnl/__init__.py create mode 100644 python/tvm/relay/op/contrib/dnnl/extern_op.py create mode 100644 src/relay/backend/contrib/dnnl/codegen.cc create mode 100644 src/relay/backend/contrib/dnnl/libs.cc diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index c8e862ac5e84..2fc88449d8cc 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -18,13 +18,18 @@ message(STATUS "Build with relay.backend.contrib") # Gcc (for 
demo purpose) -file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/*.cc) +file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) # CBLAS (for demo purpose) -file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/*.cc) +file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/codegen.cc) if(USE_BLAS STREQUAL "mkl") list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) elseif(USE_BLAS STREQUAL "none") # pass endif() + +# DNNL (for demo purpose) +file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) +list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) + diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 7b294b56c40e..79ed2223a5b9 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -50,7 +50,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return An array of strings of the library paths. */ - virtual const std::vector GetExternLibPaths() const = 0; + virtual const std::vector GetExternLibPaths(std::string id = "") const = 0; /*! * \brief Build the shared library of external ops. @@ -83,9 +83,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - if (!IsHandleOpen()) { - Open(this->GetExternLibPaths()); - } + auto id = GetSubgraphID(name); + Open(this->GetExternLibPaths(id)); CHECK(handle_) << "The external module has not been built or failed to open.\n"; auto func_s = this->InvokeExternFunc(name, sptr_to_self); @@ -113,16 +112,57 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } /*! - * \brief Check if the library is opened or not. + * \brief Split the encoded function name to tokens. * + * \param the function name string. * - * \return True if the library is already opened. + * \return a vector of tokenized function name splitted by "_". */ - virtual bool IsHandleOpen() { - return handle_ != nullptr; + std::string GetSubgraphID(Function& func) { + const auto name_node = FunctionGetAttr(func, "func_name").as(); + CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; + std::string name = name_node->value; + return GetSubgraphID(name); + } + + std::string GetSubgraphID(std::string name) { + std::string temp = name; + std::vector tokens; + std::string delimiter = "_"; + size_t pos = 0; + std::string token; + while ((pos = temp.find(delimiter)) != std::string::npos) { + token = temp.substr(0, pos); + tokens.push_back(token); + temp.erase(0, pos + delimiter.length()); + } + tokens.push_back(temp); + + CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; + CHECK(tokens[0] == "subgraph") << "Function name does not start with \"subgraph\": " << name; + return tokens[1]; + } + + bool IsOp(const CallNode* call, std::string op_name) { + const auto* op_node = call->op.as(); + CHECK(op_node) << "Expects a single op."; + Op op = GetRef(op_node); + return op == Op::Get(op_name); } protected: + std::vector GetShape(const Expr& expr) const { + const auto* ttype = expr->checked_type().as(); + CHECK(ttype); + std::vector _shape; + for (int i = 0; i < ttype->shape.size(); ++i) { + auto* val = ttype->shape[i].as(); + CHECK(val); + _shape.push_back(val->value); + } + return _shape; + } + // Platform dependent handlers for opening system lib. #if defined(_WIN32) // The handle. 
@@ -152,6 +192,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { // load the library virtual void Open(const std::vector lib_names) { + Close(); CHECK(lib_names.size() == 1) << "Default library loader only loads one library. " << "Please override the loader if multiple libraries are used"; handle_ = dlopen(lib_names[0].c_str(), RTLD_LAZY | RTLD_LOCAL); @@ -168,7 +209,12 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * \note Exceptions when loading the symbol can be retrieved by dlerror(). */ virtual void* GetSymbol(const std::string& name) { - return dlsym(handle_, name.c_str()); + auto sym = dlsym(handle_, name.c_str()); + char* error = dlerror(); + if (error) { + CHECK(0) << "Fail to get symbol " << name << ": " << error; + } + return sym; } virtual void Close() { diff --git a/python/tvm/relay/op/contrib/dnnl/__init__.py b/python/tvm/relay/op/contrib/dnnl/__init__.py new file mode 100644 index 000000000000..0da426ab4741 --- /dev/null +++ b/python/tvm/relay/op/contrib/dnnl/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""Neural network related operators.""" +from __future__ import absolute_import as _abs +from .extern_op import * diff --git a/python/tvm/relay/op/contrib/dnnl/extern_op.py b/python/tvm/relay/op/contrib/dnnl/extern_op.py new file mode 100644 index 000000000000..a8da36a5c32c --- /dev/null +++ b/python/tvm/relay/op/contrib/dnnl/extern_op.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""CBLAS library supported operators.""" +from __future__ import absolute_import + +def conv2d(attrs, args): + """Check if the external codegen should be used. + """ + return True + +def dense(attrs, args): + """Check if the external codegen should be used. + """ + return True + +def relu(attrs, args): + """Check if the external codegen should be used. 
+ """ + return True diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index a6ba2dfa525b..ab43bff40091 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -27,12 +27,15 @@ """ from __future__ import absolute_import +import logging import pkgutil from pathlib import Path from importlib import import_module from .. import op as reg +logger = logging.getLogger('ExternOp') + # Load available contrib compilers compilers = {} for _, name, _ in pkgutil.iter_modules([Path(__file__).parent]): @@ -47,7 +50,8 @@ def get_extern_op(compiler, op_name): if hasattr(extern_op, op_name): return getattr(extern_op, op_name) - raise RuntimeError("%s in %s is not registered" % (op_name, compiler)) + logger.warning("%s in %s is not registered. Fallback to CPU" % (op_name, compiler)) + return lambda x, y: False @reg.register_extern_op("nn.conv2d") def external_conv2d(attrs, args, compiler): @@ -62,6 +66,12 @@ def external_dense(attrs, args, compiler): """ return get_extern_op(compiler, 'dense')(attrs, args) +@reg.register_extern_op("nn.relu") +def external_relu(attrs, args, compiler): + """Check if the external compiler should be used. + """ + return get_extern_op(compiler, 'relu')(attrs, args) + @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc index 735faf3cba19..0308db23c515 100644 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -36,13 +36,13 @@ namespace tvm { namespace relay { namespace contrib { -typedef void (*CblasFloat)(float* a, float* b, float* out, int M, int N, int K); -typedef void (*CblasDouble)(double* a, double* b, double* out, int M, int N, int K); +typedef void (*CblasFloat)(float* a, float* b, float* out); +typedef void (*CblasDouble)(double* a, double* b, double* out); class CblasModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths() const override { - return {"/tmp/relay_extern_cblas.so"}; + const std::vector GetExternLibPaths(std::string id = "") const override { + return {"/tmp/relay_cblas_lib_" + id + ".so"}; } /*! 
@@ -57,45 +57,36 @@ class CblasModuleNode : public ExternModuleNodeBase { */ runtime::PackedFunc InvokeExternFunc(const std::string& name, const std::shared_ptr& sptr_to_self) override { - if (name == "nn.dense") { - curr_op_name = "dense"; - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - runtime::NDArray data = args[0]; - runtime::NDArray weight = args[1]; - runtime::NDArray out = args[2]; - int M = CountRow(data); - int N = CountColumn(weight); - int K = CountColumn(data); - - const DLTensor* dptr = data.operator->(); - std::string encoded_name = curr_op_name + "_" + std::to_string(dptr->dtype.code) + "_" + - std::to_string(dptr->dtype.bits); - - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - float* d_data = reinterpret_cast(data->data); - float* weight_data = reinterpret_cast(weight->data); - float* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - (*func_s_)(d_data, weight_data, out_data, M, N, K); - } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { - double* d_data = reinterpret_cast(data->data); - double* weight_data = reinterpret_cast(weight->data); - double* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - (*func_s_)(d_data, weight_data, out_data, M, N, K); - } else { - LOG(FATAL) << "Only support float32 and float64 types."; - } - - *rv = out; - }); - } else { - LOG(INFO) << name << " is not Supported. Only nn.dense is supported so far"; - return PackedFunc(); - } + _curr_id = GetSubgraphID(name); + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + runtime::NDArray data = args[0]; + runtime::NDArray weight = args[1]; + runtime::NDArray out = args[2]; + + const DLTensor* dptr = data.operator->(); + std::string encoded_name = _prefix + _curr_id; + + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + float* d_data = reinterpret_cast(data->data); + float* weight_data = reinterpret_cast(weight->data); + float* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + (*func_s_)(d_data, weight_data, out_data); + } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { + double* d_data = reinterpret_cast(data->data); + double* weight_data = reinterpret_cast(weight->data); + double* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + (*func_s_)(d_data, weight_data, out_data); + } else { + LOG(FATAL) << "Only support float32 and float64 types."; + } + + *rv = out; + }); } /*! @@ -117,48 +108,60 @@ class CblasModuleNode : public ExternModuleNodeBase { Function func = Downcast(expr); CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); - CHECK(call) << "CBLAS expects a single convolution or dense op."; - - const auto* op_node = call->op.as(); - CHECK(op_node) << "CBLAS expects a single convolution or dense op."; - Op op = GetRef(op_node); - if (op == Op::Get("nn.conv2d")) { - // const auto* conv2d_attr = call->attrs.as(); - // TODO(@zhiics) Generate the template. - } else if (op == Op::Get("nn.dense")) { - // TODO(@zhiics) Generate the template. - // const auto* dense_attr = call->attrs.as(); + CHECK(call) << "CBLAS expects a single dense op."; + + // Record subgraph ID for runtime invoke. 
+ auto id = GetSubgraphID(func); + std::string encoded_id = _prefix + id; + std::string code = ""; + + // Args: ID + std::vector args; + args.push_back(encoded_id); + + if (IsOp(call, "nn.dense")) { + auto ishape = GetShape(call->args[0]); + auto wshape = GetShape(call->args[1]); + + // Args: M, N, K + args.push_back(std::to_string(ishape[0])); + args.push_back(std::to_string(wshape[1])); + args.push_back(std::to_string(ishape[1])); + + auto type_node = call->checked_type().as(); + CHECK(type_node != nullptr); + CHECK(type_node->dtype.is_float()) << "Only support float types"; + + code = "DENSE_FP" + std::to_string(type_node->dtype.bits()) + "(" + + args[0] + ", " + args[1] + ", " + args[2] + ", " + args[3] + ");"; } else { - LOG(FATAL) << "CBLAS expects a single convolution or dense op."; + LOG(FATAL) << "CBLAS expects a single dense op."; } if (!std::getenv("MKLROOT")) { LOG(FATAL) << "MKLROOT not found. Did you source mklvars.sh?"; } - int ret = std::system("g++ -O2 -Wall -std=c++11 -shared -fPIC " - "src/relay/backend/contrib/cblas/libs.cc " - "-o /tmp/relay_extern_cblas.so -ldl -lpthread -lm -lmkl_rt"); + std::string lib_src_name = "/tmp/relay_cblas_lib_" + id + ".cc"; + std::string lib_name = "/tmp/relay_cblas_lib_" + id + ".so"; + + // Prepare library source + std::string cmd = "cp src/relay/backend/contrib/cblas/libs.cc " + lib_src_name; + std::system(cmd.c_str()); + + cmd = "echo \"" + code + "\" >> " + lib_src_name; + std::system(cmd.c_str()); + + cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + " -o " + lib_name + + " -ldl -lpthread -lm -lmkl_rt"; + int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile CBLAS library. Error code: " << ret; } } private: - // Get the number of rows of a ndarray. - int CountRow(const runtime::NDArray& data) const { - const DLTensor* tensor = data.operator->(); - CHECK(tensor) << "No container is defined in the NDArray" << "\n"; - return tensor->shape[0]; - } - - // Get the number of columns of a ndarray. - int CountColumn(const runtime::NDArray& data) const { - const DLTensor* tensor = data.operator->(); - CHECK(tensor) << "No container is defined in the NDArray" << "\n"; - return tensor->shape[1]; - } - - std::string curr_op_name; + std::string _curr_id; + std::string _prefix = "cblas_"; }; /*! diff --git a/src/relay/backend/contrib/cblas/libs.cc b/src/relay/backend/contrib/cblas/libs.cc index d9a33f880a66..6cd08d75ca30 100644 --- a/src/relay/backend/contrib/cblas/libs.cc +++ b/src/relay/backend/contrib/cblas/libs.cc @@ -16,20 +16,17 @@ * under the License. 
*/ -#ifdef __cplusplus -extern "C" { #include #include -#endif // extern "C" -void dense_2_32(float* A, float* B, float* C, int M, int N, int K) { - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); -} +#define DENSE_FP32(p_ID_, p_M_, p_N_, p_K_) \ + extern "C" void p_ID_(float* A, float* B, float* C) { \ + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p_M_, p_N_, p_K_, 1.0, A, p_K_, B, p_N_, \ + 0.0, C, p_N_); \ + } -void dense_2_64(double* A, double* B, double* C, int M, int N, int K) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0, A, K, B, N, 0.0, C, N); -} - -#ifdef __cplusplus -} -#endif +#define DENSE_FP64(p_ID_, p_M_, p_N_, p_K_) \ + extern "C" void p_ID_(double* A, double* B, double* C) { \ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p_M_, p_N_, p_K_, 1.0, A, p_K_, B, p_N_, \ + 0.0, C, p_N_); \ + } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc new file mode 100644 index 000000000000..e90d8aa43812 --- /dev/null +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -0,0 +1,236 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +typedef void (*DNNL2PFP32)(float* input, float* out); +typedef void (*DNNL3PFP32)(float* input, float* weights, float* out); + +class DNNLModuleNode : public ExternModuleNodeBase { + public: + const std::vector GetExternLibPaths(std::string id) const override { + return {"/tmp/relay_dnnl_lib_" + id + ".so"}; + } + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. + * + * \param name the name of the external function. + * \param func_s The function symbol retrieved from the external library. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. 
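+   * \note The returned PackedFunc dispatches on the number of packed arguments:
+   *       two tensors (data, out) for unary ops such as relu, and three
+   *       (data, weight, out) for conv2d and dense.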
+ */ + runtime::PackedFunc InvokeExternFunc(const std::string& name, + const std::shared_ptr& sptr_to_self) override { + _curr_id = GetSubgraphID(name); + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + if (args.size() == 3U) { + runtime::NDArray data = args[0]; + runtime::NDArray weight = args[1]; + runtime::NDArray out = args[2]; + + const DLTensor* dptr = data.operator->(); + std::string encoded_name = _prefix + _curr_id; + + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + float* d_data = reinterpret_cast(data->data); + float* weight_data = reinterpret_cast(weight->data); + float* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + try { + (*func_s_)(d_data, weight_data, out_data); + } catch (const std::exception& e) { + LOG(FATAL) << e.what(); + } + } else { + LOG(FATAL) << "Only support float32 types."; + } + *rv = out; + } + else if (args.size() == 2U) { + runtime::NDArray data = args[0]; + runtime::NDArray out = args[1]; + + const DLTensor* dptr = data.operator->(); + std::string encoded_name = _prefix + _curr_id; + + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + float* d_data = reinterpret_cast(data->data); + float* out_data = reinterpret_cast(out->data); + + auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); + try { + (*func_s_)(d_data, out_data); + } catch (const std::exception& e) { + LOG(FATAL) << e.what(); + } + } else { + LOG(FATAL) << "Only support float32 types."; + } + *rv = out; + } + else { + LOG(FATAL) << "Unsupported argument number: " << args.size(); + } + }); + } + + /*! + * \brief Get the source code of the external module. + * + * \param format The format of the source code. + * + * \return The source code of the external library module in the text form. + */ + TVM_DLL std::string GetSource(const std::string& format = "") override { return ""; } + + const char* type_key() const override { return "DNNLModule"; } + + void Build(const Expr& expr) override { + Function func = Downcast(expr); + CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + const auto* call = func->body.as(); + CHECK(call) << "DNNL expects a single convolution or dense op."; + + // Record subgraph ID for runtime invoke. 
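+    // The <id> below also names the generated symbol ("dnnl_<id>") and library.
+    // Each supported op is then lowered to one of the CONV2D/DENSE/RELU macros
+    // in libs.cc; Compile() appends that macro instantiation to a copy of
+    // libs.cc and builds it into the per-subgraph library with g++ -ldnnl.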
+ auto id = GetSubgraphID(func); + std::string encoded_id = _prefix + id; + std::string code = ""; + + // Args: ID + std::vector args; + args.push_back(encoded_id); + + if (IsOp(call, "nn.conv2d")) { + code = "CONV2D"; + const auto* conv2d_attr = call->attrs.as(); + + auto ishape = GetShape(call->args[0]); + auto wshape = GetShape(call->args[1]); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + // Args: O, G, Ph, Pw, Kh, Kw, Sh, Sw + args.push_back(std::to_string(wshape[0])); + args.push_back(std::to_string(conv2d_attr->groups)); + args.push_back(std::to_string(conv2d_attr->padding[0].as()->value)); + args.push_back(std::to_string(conv2d_attr->padding[1].as()->value)); + args.push_back(std::to_string(wshape[2])); + args.push_back(std::to_string(wshape[3])); + args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); + args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); + } else if (IsOp(call, "nn.dense")) { + code = "DENSE"; + + auto ishape = GetShape(call->args[0]); + auto wshape = GetShape(call->args[1]); + + // Args: N, C, O + args.push_back(std::to_string(ishape[0])); + args.push_back(std::to_string(ishape[1])); + args.push_back(std::to_string(wshape[0])); + + } else if (IsOp(call, "nn.relu")) { + code = "RELU"; + + auto ishape = GetShape(call->args[0]); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + } else { + LOG(FATAL) << "DNNL expects a single convolution or dense op."; + } + Compile(id, code, args); + } + + private: + void Compile(std::string id, std::string code, std::vector args) { + // FIXME: Now we compile N libraries for N subgraphs, but we should merge them to one. + std::string lib_src_name = "/tmp/relay_dnnl_lib_" + id + ".cc"; + std::string lib_name = "/tmp/relay_dnnl_lib_" + id + ".so"; + + // Prepare library source + std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + lib_src_name; + std::system(cmd.c_str()); + + // Push macro implementation + bool first = true; + std::string macro = code + "("; + for (auto arg : args) { + if (!first) macro += ", "; + first = false; + macro += arg; + } + macro += ")"; + cmd = "echo \"" + macro + ";\" >> " + lib_src_name; + std::system(cmd.c_str()); + + // Compile + cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + + " -o " + lib_name + " -ldl -lpthread -lm -ldnnl"; + int ret = std::system(cmd.c_str()); + if (ret != 0) { + LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; + } + } + + std::string _curr_id; + const std::string _prefix = "dnnl_"; +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression and + * compile it into a runtime module. + */ +runtime::Module DNNLCompiler(const Expr& expr) { + std::shared_ptr n = std::make_shared(); + n->Build(expr); + return runtime::Module(n); +} + +TVM_REGISTER_API("relay.ext.dnnl").set_body_typed(DNNLCompiler); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc new file mode 100644 index 000000000000..023cb58a26a6 --- /dev/null +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -0,0 +1,148 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include "dnnl.hpp" + +using namespace dnnl; + +// Read from memory, write to handle +inline void read_from_dnnl_memory(void* handle, memory& mem) { + size_t bytes = mem.get_desc().get_size(); + + uint8_t* src = static_cast(mem.get_data_handle()); + std::copy(src, src + bytes, (uint8_t*)handle); +} + +#define CONV2D(p_ID_, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, \ + p_Sw_) \ + extern "C" void p_ID_(float* data, float* weights, float* out) { \ + using tag = memory::format_tag; \ + using dt = memory::data_type; \ + engine eng(engine::kind::cpu, 0); \ + stream s(eng); \ + \ + memory::dims conv2d_src_tz = {p_N_, p_C_, p_H_, p_W_}; \ + memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; \ + if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; \ + memory::dims conv2d_bias_tz = {p_O_}; \ + memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, \ + (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; \ + memory::dims conv2d_strides = {p_Sh_, p_Sw_}; \ + memory::dims conv2d_padding = {p_Ph_, p_Pw_}; \ + \ + std::vector conv2d_bias(p_O_, 0); \ + \ + auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); \ + auto user_weights_memory = \ + memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? 
tag::goihw : tag::oihw}, eng, weights); \ + auto conv2d_user_bias_memory = \ + memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); \ + \ + auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); \ + auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); \ + auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); \ + auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::any); \ + \ + auto conv2d_desc = \ + convolution_forward::desc(prop_kind::forward_inference, algorithm::convolution_direct, \ + conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md, \ + conv2d_strides, conv2d_padding, conv2d_padding); \ + auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); \ + \ + auto conv2d_src_memory = user_src_memory; \ + auto conv2d_weights_memory = user_weights_memory; \ + auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); \ + \ + auto conv = convolution_forward(conv2d_prim_desc); \ + conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, \ + {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, \ + {DNNL_ARG_BIAS, conv2d_user_bias_memory}, \ + {DNNL_ARG_DST, conv2d_dst_memory}}); \ + s.wait(); \ + read_from_dnnl_memory(out, conv2d_dst_memory); \ + } + +#define DENSE(p_ID_, p_B_, p_I_, p_O_) \ + extern "C" void p_ID_(float* data, float* weight, float* out) { \ + using tag = memory::format_tag; \ + using dt = memory::data_type; \ + \ + engine eng(engine::kind::cpu, 0); \ + stream s(eng); \ + \ + memory::dims data_tz = {p_B_, p_I_}; \ + memory::dims weight_tz = {p_O_, p_I_}; \ + memory::dims bias_tz = {p_O_}; \ + memory::dims dst_tz = {p_B_, p_O_}; \ + \ + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nc}; \ + auto weight_md = memory::desc({{weight_tz}, dt::f32, tag::nc}); \ + auto bias_md = memory::desc({{bias_tz}, dt::f32, tag::x}); \ + auto dst_md = memory::desc({{dst_tz}, dt::f32, tag::nc}); \ + \ + std::vector bias(p_O_, 0); \ + auto data_memory = memory(data_md, eng, data); \ + auto weight_memory = memory(weight_md, eng, weight); \ + auto bias_memory = memory(bias_md, eng, bias.data()); \ + auto dst_memory = memory(dst_md, eng); \ + \ + auto dense_desc = inner_product_forward::desc(prop_kind::forward_inference, data_md, \ + weight_md, bias_md, dst_md); \ + auto dense_prim_desc = inner_product_forward::primitive_desc(dense_desc, eng); \ + assert(dst_md == dense_prim_desc.dst_desc()); \ + \ + auto dense = inner_product_forward(dense_prim_desc); \ + dense.execute(s, {{DNNL_ARG_SRC, data_memory}, \ + {DNNL_ARG_WEIGHTS, weight_memory}, \ + {DNNL_ARG_BIAS, bias_memory}, \ + {DNNL_ARG_DST, dst_memory}}); \ + s.wait(); \ + read_from_dnnl_memory(out, dst_memory); \ + } + +#define RELU(p_ID_, p_N_, p_C_, p_H_, p_W_) \ + extern "C" void p_ID_(float* data, float* out) { \ + using tag = memory::format_tag; \ + using dt = memory::data_type; \ + \ + engine eng(engine::kind::cpu, 0); \ + stream s(eng); \ + \ + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ + \ + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ + \ + auto data_memory = memory(data_md, eng, data); \ + auto dst_memory = memory(data_md, eng); \ + \ + auto relu_desc = \ + eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu, data_md, 0); \ + auto relu_prim_desc = eltwise_forward::primitive_desc(relu_desc, eng); \ + assert(data_md == relu_prim_desc.dst_desc()); \ + \ + auto relu = eltwise_forward(relu_prim_desc); \ + relu.execute(s, {{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, 
dst_memory}}); \ + s.wait(); \ + read_from_dnnl_memory(out, dst_memory); \ + } diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index d4e8fb7af811..6b33adf10a5d 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -34,8 +34,8 @@ typedef void (*GccBinaryFunc)(ExternalTensor a, ExternalTensor b, ExternalTensor class GccModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths() const override { - return {"/tmp/relay_extern_gcc.so"}; + const std::vector GetExternLibPaths(std::string id = "") const override { + return {"/tmp/relay_gcc_lib_" + id + ".so"}; } /*! @@ -50,42 +50,37 @@ class GccModuleNode : public ExternModuleNodeBase { */ runtime::PackedFunc InvokeExternFunc(const std::string& name, const std::shared_ptr& sptr_to_self) override { - if (name == "subtract" || "add" || "multiply") { - func_s_ = reinterpret_cast(GetSymbol(name)); - - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - runtime::NDArray a = args[0]; - ExternalTensor lhs; - lhs.data = a->data; - lhs.ndim = a.Shape().size(); - // lhs.shape = a.Shape().data(); - lhs.shape = new int64_t[lhs.ndim]; - - runtime::NDArray b = args[1]; - ExternalTensor rhs; - rhs.data = b->data; - rhs.ndim = b.Shape().size(); - rhs.shape = new int64_t[rhs.ndim]; - // rhs.shape = b.Shape().data(); - - runtime::NDArray c = args[2]; - ExternalTensor out; - out.data = c->data; - out.ndim = c.Shape().size(); - out.shape = c.Shape().data(); - - for (int i = 0; i < lhs.ndim; i++) { - lhs.shape[i] = a.Shape()[i]; - rhs.shape[i] = b.Shape()[i]; - } - (*func_s_)(lhs, rhs, &out); - *rv = c; - }); - } else { - LOG(FATAL) << "Unknown function found when invoking extern library: " << name; - return PackedFunc(); - } + std::string _curr_id = GetSubgraphID(name); + std::string encoded_id = _prefix + _curr_id; + func_s_ = reinterpret_cast(GetSymbol(encoded_id)); + + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + CHECK_EQ(args.size(), 3U); + runtime::NDArray a = args[0]; + ExternalTensor lhs; + lhs.data = a->data; + lhs.ndim = a.Shape().size(); + lhs.shape = new int64_t[lhs.ndim]; + + runtime::NDArray b = args[1]; + ExternalTensor rhs; + rhs.data = b->data; + rhs.ndim = b.Shape().size(); + rhs.shape = new int64_t[rhs.ndim]; + + runtime::NDArray c = args[2]; + ExternalTensor out; + out.data = c->data; + out.ndim = c.Shape().size(); + out.shape = c.Shape().data(); + + for (int i = 0; i < lhs.ndim; i++) { + lhs.shape[i] = a.Shape()[i]; + rhs.shape[i] = b.Shape()[i]; + } + (*func_s_)(lhs, rhs, &out); + *rv = c; + }); } /*! @@ -106,9 +101,38 @@ class GccModuleNode : public ExternModuleNodeBase { void Build(const Expr& expr) override { Function func = Downcast(expr); CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; - int ret = std::system( - "g++ -std=c++11 -shared -fPIC -ldl src/relay/backend/contrib/gcc/libs.cc " - "-o /tmp/relay_extern_gcc.so"); + const auto* call = func->body.as(); + CHECK(call) << "GCC expects a single op."; + + // Record subgraph ID for runtime invoke. 
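+    // (Sketch, assuming this subgraph was assigned id 0 and wraps an "add":
+    // the code below emits the single line
+    //   GCC_BINARY_OP(gcc_0, +);
+    // appends it to a copy of libs.cc, and compiles the result into
+    // /tmp/relay_gcc_lib_0.so, which GetExternLibPaths() returns at runtime.)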
+ auto id = GetSubgraphID(func); + std::string encoded_id = _prefix + id; + std::string code = "GCC_BINARY_OP(" + encoded_id + ", "; + + if (IsOp(call, "add")) { + code += "+"; + } else if (IsOp(call, "subtract")) { + code += "-"; + } else if (IsOp(call, "multiply")) { + code += "*"; + } else { + LOG(FATAL) << "Unrecognized op: "; + } + code += ");"; + + // Prepare library source + std::string lib_src_name = "/tmp/relay_gcc_lib_" + id + ".cc"; + std::string lib_name = "/tmp/relay_gcc_lib_" + id + ".so"; + std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + lib_src_name; + std::system(cmd.c_str()); + std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"); + + cmd = "echo \"" + code + "\" >> " + lib_src_name; + std::system(cmd.c_str()); + + cmd = "g++ -std=c++11 -shared -fPIC -ldl " + lib_src_name + + " -o " + lib_name; + int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile GCC library. Error code: " << ret; } @@ -116,6 +140,7 @@ class GccModuleNode : public ExternModuleNodeBase { private: GccBinaryFunc func_s_; + std::string _prefix = "gcc_"; }; diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc index 4c81567c3845..4a94b564465d 100644 --- a/src/relay/backend/contrib/gcc/libs.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -56,38 +56,3 @@ } \ } -GCC_BINARY_OP(subtract, -); -GCC_BINARY_OP(add, +); -GCC_BINARY_OP(multiply, *); - -// extern "C" void Subtract(ExternalTensor a, ExternalTensor b, ExternalTensor* out) { -// if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { -// std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim -// << ", b.ndim = " << b.ndim -// << ", out ndim = " << out->ndim << std::endl; -// } -// for (int i = 0; i < a.ndim; i++) { -// if (a.shape[i] != b.shape[i]) { -// std::cerr << "shape[" << i << "]: a = " << a.shape[i] << ", b = " << b.shape[i] << std::endl; -// } -// } -// std::cout << "dim: " << a.ndim << " shape: " << std::endl; -// for (int i = 0; i < a.ndim; i++) { -// std::cout << a.shape[i] << " " << b.shape[i] << std::endl; -// } -// float* a_ptr = static_cast(a.data); -// float* b_ptr = static_cast(b.data); -// float* out_ptr = static_cast(out->data); -// if (a.ndim == 1) { -// for (int64_t i = 0; i < a.shape[0]; i++) { -// out_ptr[i] = a_ptr[i] - b_ptr[i]; -// } -// } else { -// for (int64_t i = 0; i < a.shape[0]; i++) { -// for (int64_t j = 0; j < a.shape[1]; j++) { -// int64_t k = i * a.shape[1] + j; -// out_ptr[k] = a_ptr[k] - b_ptr[k]; -// } -// } -// } -// } diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index bd3c284f25ed..714a07c26505 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -66,6 +66,9 @@ class ExternOpWrapper : public ExprMutator { return end; } } + else { + LOG(WARNING) << op.operator->()->name << " in " << compiler_ << " is not registered"; + } return new_e; } diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index e205a6fa6b60..60ffe3bc7234 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -201,10 +201,10 @@ class Partitioner : public ExprMutator { // external funciton and leave the processing of the function to codegen. // Otherwise, it's hard to deal with multiple-node subgraphs. 
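+      // For illustration (the id below is hypothetical): a partitioned dense
+      // call ends up as a function named "subgraph_1_nn.dense"; the numeric id
+      // is what GetSubgraphID() later recovers on the runtime side to locate
+      // the matching /tmp library.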
Expr arg0 = call->args[0]; - std::string name = "subgraph"; + std::string name = "subgraph_" + std::to_string(subgraph->id); if (const auto* arg_call = arg0.as()) { if (const auto* op_node = arg_call->op.as()) { - name = op_node->name; + name += "_" + op_node->name; } } subgraph_func = diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 4d8be1461e48..7f69801aedc5 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -87,23 +87,92 @@ def test_extern_cblas(): m = 16 n = 224 k = 224 - for dtype in ['float32', 'float64']: - x = relay.var('x', shape=(m, k), dtype=dtype) - y = relay.var('y', shape=(n, k), dtype=dtype) - f = relay.Function([x, y], relay.op.nn.dense(x, y)) - mod = relay.Module() - mod['main'] = f - mod = relay.transform.ExternOp('cblas')(mod) - mod = relay.transform.PartitionGraph()(mod) - - x_data = np.random.uniform(0, 1, (m, k)).astype(dtype) - y_data = np.random.uniform(0, 1, (n, k)).astype(dtype) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose( - res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) + dtype = 'float64' + x = relay.var('x', shape=(m, k), dtype=dtype) + y = relay.var('y', shape=(n, k), dtype=dtype) + f = relay.Function([x, y], relay.op.nn.dense(x, y)) + mod = relay.Module() + mod['main'] = f + mod = relay.transform.ExternOp('cblas')(mod) + mod = relay.transform.PartitionGraph()(mod) + + x_data = np.random.uniform(0, 1, (m, k)).astype(dtype) + y_data = np.random.uniform(0, 1, (n, k)).astype(dtype) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose( + res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) + +def test_extern_dnnl(): + dtype = 'float32' + ishape = (1, 32, 14, 14) + w1shape = (32, 1, 3, 3) + w2shape = (100, 32 * 14 * 14) + + data = relay.var('data', shape=(ishape), dtype=dtype) + weight1 = relay.var('weight1', shape=(w1shape), dtype=dtype) + depthwise_conv2d_1 = relay.nn.conv2d(data, + weight1, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + depthwise_conv2d_2 = relay.nn.conv2d(depthwise_conv2d_1, + weight1, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + out1 = relay.add(depthwise_conv2d_1, depthwise_conv2d_2) + out2 = relay.nn.batch_flatten(data=out1) + weight2 = relay.var('weight2', shape=(w2shape), dtype=dtype) + out3 = relay.nn.dense(out2, weight2) + + f = relay.Function([data, weight1, weight2], out3) + + mod = relay.Module() + mod['main'] = f + mod = relay.transform.ExternOp('dnnl')(mod) + mod = relay.transform.PartitionGraph()(mod) + + ref_mod = relay.Module() + ref_mod['main'] = f + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) + w2_data = np.random.uniform(0, 1, w2shape).astype(dtype) + + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(i_data, w1_data, w2_data) + + ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(i_data, w1_data, w2_data) + + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + + +def test_extern_dnnl_mobilenet(): + # FIXME: This test is only for demo purpose and supposed to be removed. 
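+    # With single-op subgraphs, every supported MobileNet op is partitioned
+    # into its own DNNL subgraph and compiled into a separate /tmp library
+    # (see the FIXME in DNNLModuleNode::Compile), so this end-to-end check is
+    # heavyweight and kept only as a demo.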
+ dtype = 'float32' + ishape = (1, 3, 224, 224) + mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') + + mod = relay.transform.ExternOp('dnnl')(mod) + mod = relay.transform.PartitionGraph()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(i_data, **params) + + ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') + ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(i_data, **params) + + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + if __name__ == "__main__": test_partition_graph() test_extern_gcc() test_extern_cblas() + test_extern_dnnl() + #test_extern_dnnl_mobilenet() From 47517afc44c19992111507031708906c1930b674 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Fri, 20 Sep 2019 14:55:24 -0700 Subject: [PATCH 15/34] Simplify runtime invoke Remaining issues: - For the op that has multiple outputs like batch norm, we don't know how many it has. This is also a problem when supporting subgraph with many nodes. - Have no idea how to deal with the function that has different data types of inputs. --- include/tvm/relay/contrib_codegen.h | 127 +++++++++++++++--- python/tvm/relay/op/contrib/dnnl/extern_op.py | 5 + python/tvm/relay/op/contrib/extern_op.py | 6 + src/relay/backend/contrib/cblas/codegen.cc | 58 +------- src/relay/backend/contrib/dnnl/codegen.cc | 90 +++---------- src/relay/backend/contrib/dnnl/libs.cc | 41 ++++++ src/relay/backend/contrib/gcc/codegen.cc | 70 +++------- src/relay/backend/contrib/gcc/libs.cc | 47 ++----- src/relay/pass/fuse_ops.cc | 3 +- .../python/relay/test_pass_partition_graph.py | 41 +++++- 10 files changed, 261 insertions(+), 227 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 79ed2223a5b9..05fb9adaab68 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -38,6 +38,21 @@ namespace tvm { namespace relay { namespace contrib { +template +using F2ARGS = void (*)(T* a, T* b); +template +using F3ARGS = void (*)(T* a, T* b, T* c); +template +using F4ARGS = void (*)(T* a, T* b, T* c, T* d); +template +using F5ARGS = void (*)(T* a, T* b, T* c, T* d, T* e); +template +using F6ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f); +template +using F7ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f, T* g); +template +using F8ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f, T* g, T* h); + class ExternModuleNodeBase : public runtime:: ModuleNode { public: ExternModuleNodeBase() = default; @@ -53,24 +68,62 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { virtual const std::vector GetExternLibPaths(std::string id = "") const = 0; /*! - * \brief Build the shared library of external ops. - * - * \param expr The subgraph Relay expression to be executed using extern ops. + * \brief Get the function prefix of this compiler. * + * \return A string of the function name prefix in the library. */ - virtual void Build(const Expr& expr) = 0; + virtual const std::string GetPrefix() const = 0; /*! - * \brief The extern module specific implementation of invoking pre-built functions. + * \brief Build the shared library of external ops. * - * \param name the name of the external function. - * \param func_s The function symbol retrieved from the external library. - * \param sptr_to_self The shared_ptr that points to this module node. 
+ * \param expr The subgraph Relay expression to be executed using extern ops. * - * \return PackedFunc(nullptr) when it is not available. */ - virtual runtime::PackedFunc InvokeExternFunc(const std::string& name, - const std::shared_ptr& sptr_to_self) = 0; + virtual void Build(const Expr& expr) = 0; + + void SetSubgraphInfo(std::string id, const DLDataType type, int num_args) { + _subgraph_info[id] = std::make_pair(type, num_args); + } + + std::pair GetSubgraphInfo(std::string id) { + if (_subgraph_info.count(id) == 0) { + LOG(FATAL) << "Info of subgraph " << id << " is missing."; + } + return _subgraph_info[id]; + } + + template + void Invoke(void* func_s, std::vector data) { + try { + if (data.size() == 2) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1]); + } else if (data.size() == 3) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2]); + } else if (data.size() == 4) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2], data[3]); + } else if (data.size() == 5) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2], data[3], data[4]); + } else if (data.size() == 6) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2], data[3], data[4], data[5]); + } else if (data.size() == 7) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6]); + } else if (data.size() == 8) { + auto func = reinterpret_cast>(func_s); + (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); + } else { + LOG(FATAL) << "Unsupported argument number: " << data.size(); + } + } catch (const std::exception& e) { + LOG(FATAL) << "Execution failure: " << e.what(); + } + } /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked @@ -83,17 +136,47 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - auto id = GetSubgraphID(name); - Open(this->GetExternLibPaths(id)); + _curr_id = GetSubgraphID(name); + Open(this->GetExternLibPaths(_curr_id)); CHECK(handle_) << "The external module has not been built or failed to open.\n"; - auto func_s = this->InvokeExternFunc(name, sptr_to_self); - char* error = dlerror(); - if (error != NULL) { - LOG(FATAL) << error; - return PackedFunc(); - } - return func_s; + // Generate an external packed function + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + const DLTensor* dptr = ((runtime::NDArray) args[0]).operator->(); + + // Check type and argument number + auto info = GetSubgraphInfo(_curr_id); + CHECK(info.first.code == dptr->dtype.code && info.first.bits == dptr->dtype.bits) + << "Data type of subgraph " << _curr_id << " and input is mismatch"; + CHECK(info.second == args.size()) + << "Argument number of subgraph " << _curr_id + << " and input data is mismatch: " << info.second + << " vs. 
" << args.size(); + + // Get function from the library + std::string encoded_name = GetPrefix() + _curr_id; + auto func_s = GetSymbol(encoded_name); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + std::vector data; + for (int i = 0; i < args.size(); ++i) { + runtime::NDArray arg = args[i]; + data.push_back(reinterpret_cast(arg->data)); + } + Invoke(func_s, data); + } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { + std::vector data; + for (int i = 0; i < args.size(); ++i) { + runtime::NDArray arg = args[i]; + data.push_back(reinterpret_cast(arg->data)); + } + Invoke(func_s, data); + } else { + LOG(FATAL) << "Only support float32 and float64 types."; + } + //*rv = out; + }); } /*! @@ -223,6 +306,10 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } } #endif + +private: + std::string _curr_id; + std::unordered_map> _subgraph_info; }; } // namespace contrib diff --git a/python/tvm/relay/op/contrib/dnnl/extern_op.py b/python/tvm/relay/op/contrib/dnnl/extern_op.py index a8da36a5c32c..ede78c668516 100644 --- a/python/tvm/relay/op/contrib/dnnl/extern_op.py +++ b/python/tvm/relay/op/contrib/dnnl/extern_op.py @@ -32,3 +32,8 @@ def relu(attrs, args): """Check if the external codegen should be used. """ return True + +def batch_norm(attrs, args): + """Check if the external codegen should be used. + """ + return False diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index ab43bff40091..05eb0f1edd7f 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -72,6 +72,12 @@ def external_relu(attrs, args, compiler): """ return get_extern_op(compiler, 'relu')(attrs, args) +@reg.register_extern_op("nn.batch_norm") +def external_batch_norm(attrs, args, compiler): + """Check if the external compiler should be used. + """ + return get_extern_op(compiler, 'batch_norm')(attrs, args) + @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc index 0308db23c515..dc335d13723e 100644 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ b/src/relay/backend/contrib/cblas/codegen.cc @@ -36,57 +36,14 @@ namespace tvm { namespace relay { namespace contrib { -typedef void (*CblasFloat)(float* a, float* b, float* out); -typedef void (*CblasDouble)(double* a, double* b, double* out); - class CblasModuleNode : public ExternModuleNodeBase { public: const std::vector GetExternLibPaths(std::string id = "") const override { return {"/tmp/relay_cblas_lib_" + id + ".so"}; } - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. - * - * \param name the name of the external function. - * \param func_s The function symbol retrieved from the external library. - * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. 
- */ - runtime::PackedFunc InvokeExternFunc(const std::string& name, - const std::shared_ptr& sptr_to_self) override { - _curr_id = GetSubgraphID(name); - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - runtime::NDArray data = args[0]; - runtime::NDArray weight = args[1]; - runtime::NDArray out = args[2]; - - const DLTensor* dptr = data.operator->(); - std::string encoded_name = _prefix + _curr_id; - - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - float* d_data = reinterpret_cast(data->data); - float* weight_data = reinterpret_cast(weight->data); - float* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - (*func_s_)(d_data, weight_data, out_data); - } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { - double* d_data = reinterpret_cast(data->data); - double* weight_data = reinterpret_cast(weight->data); - double* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - (*func_s_)(d_data, weight_data, out_data); - } else { - LOG(FATAL) << "Only support float32 and float64 types."; - } - - *rv = out; - }); + const std::string GetPrefix() const override { + return "cblas_"; } /*! @@ -112,12 +69,11 @@ class CblasModuleNode : public ExternModuleNodeBase { // Record subgraph ID for runtime invoke. auto id = GetSubgraphID(func); - std::string encoded_id = _prefix + id; std::string code = ""; // Args: ID std::vector args; - args.push_back(encoded_id); + args.push_back(GetPrefix() + id); if (IsOp(call, "nn.dense")) { auto ishape = GetShape(call->args[0]); @@ -131,8 +87,10 @@ class CblasModuleNode : public ExternModuleNodeBase { auto type_node = call->checked_type().as(); CHECK(type_node != nullptr); CHECK(type_node->dtype.is_float()) << "Only support float types"; + auto bits = type_node->dtype.bits(); + SetSubgraphInfo(id, DLDataType{kDLFloat, static_cast(bits), 1}, 3); - code = "DENSE_FP" + std::to_string(type_node->dtype.bits()) + "(" + + code = "DENSE_FP" + std::to_string(bits) + "(" + args[0] + ", " + args[1] + ", " + args[2] + ", " + args[3] + ");"; } else { LOG(FATAL) << "CBLAS expects a single dense op."; @@ -158,10 +116,6 @@ class CblasModuleNode : public ExternModuleNodeBase { LOG(FATAL) << "Failed to compile CBLAS library. Error code: " << ret; } } - - private: - std::string _curr_id; - std::string _prefix = "cblas_"; }; /*! diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index e90d8aa43812..d8f65cac504d 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -36,79 +36,14 @@ namespace tvm { namespace relay { namespace contrib { -typedef void (*DNNL2PFP32)(float* input, float* out); -typedef void (*DNNL3PFP32)(float* input, float* weights, float* out); - class DNNLModuleNode : public ExternModuleNodeBase { public: const std::vector GetExternLibPaths(std::string id) const override { return {"/tmp/relay_dnnl_lib_" + id + ".so"}; } - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. - * - * \param name the name of the external function. - * \param func_s The function symbol retrieved from the external library. - * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. 
- */ - runtime::PackedFunc InvokeExternFunc(const std::string& name, - const std::shared_ptr& sptr_to_self) override { - _curr_id = GetSubgraphID(name); - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - if (args.size() == 3U) { - runtime::NDArray data = args[0]; - runtime::NDArray weight = args[1]; - runtime::NDArray out = args[2]; - - const DLTensor* dptr = data.operator->(); - std::string encoded_name = _prefix + _curr_id; - - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - float* d_data = reinterpret_cast(data->data); - float* weight_data = reinterpret_cast(weight->data); - float* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - try { - (*func_s_)(d_data, weight_data, out_data); - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } - } else { - LOG(FATAL) << "Only support float32 types."; - } - *rv = out; - } - else if (args.size() == 2U) { - runtime::NDArray data = args[0]; - runtime::NDArray out = args[1]; - - const DLTensor* dptr = data.operator->(); - std::string encoded_name = _prefix + _curr_id; - - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - float* d_data = reinterpret_cast(data->data); - float* out_data = reinterpret_cast(out->data); - - auto func_s_ = reinterpret_cast(GetSymbol(encoded_name)); - try { - (*func_s_)(d_data, out_data); - } catch (const std::exception& e) { - LOG(FATAL) << e.what(); - } - } else { - LOG(FATAL) << "Only support float32 types."; - } - *rv = out; - } - else { - LOG(FATAL) << "Unsupported argument number: " << args.size(); - } - }); + const std::string GetPrefix() const override { + return "dnnl_"; } /*! @@ -130,7 +65,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { // Record subgraph ID for runtime invoke. auto id = GetSubgraphID(func); - std::string encoded_id = _prefix + id; + std::string encoded_id = GetPrefix() + id; std::string code = ""; // Args: ID @@ -138,6 +73,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { args.push_back(encoded_id); if (IsOp(call, "nn.conv2d")) { + SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); code = "CONV2D"; const auto* conv2d_attr = call->attrs.as(); @@ -159,8 +95,8 @@ class DNNLModuleNode : public ExternModuleNodeBase { args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); } else if (IsOp(call, "nn.dense")) { + SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); code = "DENSE"; - auto ishape = GetShape(call->args[0]); auto wshape = GetShape(call->args[1]); @@ -170,14 +106,27 @@ class DNNLModuleNode : public ExternModuleNodeBase { args.push_back(std::to_string(wshape[0])); } else if (IsOp(call, "nn.relu")) { + SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 2); code = "RELU"; - auto ishape = GetShape(call->args[0]); // Args: N, C, H, W for (auto s : ishape) { args.push_back(std::to_string(s)); } + } else if (IsOp(call, "nn.batch_norm")) { + SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 8); + code = "BN"; + const auto* bn_attr = call->attrs.as(); + auto ishape = GetShape(call->args[0]); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + // Args: epilson + args.push_back(std::to_string(bn_attr->epsilon)); } else { LOG(FATAL) << "DNNL expects a single convolution or dense op."; } @@ -216,7 +165,6 @@ class DNNLModuleNode : public ExternModuleNodeBase { } std::string _curr_id; - const std::string _prefix = "dnnl_"; }; /*! 
diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index 023cb58a26a6..9380c48df736 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -146,3 +146,44 @@ inline void read_from_dnnl_memory(void* handle, memory& mem) { s.wait(); \ read_from_dnnl_memory(out, dst_memory); \ } + +#define BN(p_ID_, p_N_, p_C_, p_H_, p_W_, p_E_) \ + extern "C" void p_ID_(float* data, float* gamma, float* beta, float* mean, float* variance, \ + float* out) { \ + using tag = memory::format_tag; \ + using dt = memory::data_type; \ + \ + engine eng(engine::kind::cpu, 0); \ + stream s(eng); \ + \ + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ + \ + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ + \ + auto data_memory = memory(data_md, eng, data); \ + auto dst_memory = memory(data_md, eng); \ + \ + auto bn_desc = batch_normalization_forward::desc( \ + prop_kind::forward_inference, data_md, p_E_, \ + normalization_flags::use_global_stats | normalization_flags::use_scale_shift); \ + auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); \ + assert(data_md == bn_prim_desc.dst_desc()); \ + \ + float* weight = (float*)malloc(sizeof(float) * 2 * p_C_); \ + memcpy(weight, gamma, sizeof(float) * p_C_); \ + memcpy(weight + p_C_, beta, sizeof(float) * p_C_); \ + \ + auto weight_memory = memory(bn_prim_desc.weights_desc(), eng, weight); \ + auto mean_memory = memory(bn_prim_desc.mean_desc(), eng, mean); \ + auto variance_memory = memory(bn_prim_desc.variance_desc(), eng, variance); \ + \ + auto bn = batch_normalization_forward(bn_prim_desc); \ + bn.execute(s, {{DNNL_ARG_SRC, data_memory}, \ + {DNNL_ARG_DST, dst_memory}, \ + {DNNL_ARG_SCALE_SHIFT, weight_memory}, \ + {DNNL_ARG_MEAN, mean_memory}, \ + {DNNL_ARG_VARIANCE, variance_memory}}); \ + s.wait(); \ + read_from_dnnl_memory(out, dst_memory); \ + free(weight); \ + } diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 6b33adf10a5d..45319b5ef4bf 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -38,49 +38,8 @@ class GccModuleNode : public ExternModuleNodeBase { return {"/tmp/relay_gcc_lib_" + id + ".so"}; } - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. - * - * \param name the name of the external function. - * \param func_s The function symbol retrieved from the external library. - * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. 
- */ - runtime::PackedFunc InvokeExternFunc(const std::string& name, - const std::shared_ptr& sptr_to_self) override { - std::string _curr_id = GetSubgraphID(name); - std::string encoded_id = _prefix + _curr_id; - func_s_ = reinterpret_cast(GetSymbol(encoded_id)); - - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - CHECK_EQ(args.size(), 3U); - runtime::NDArray a = args[0]; - ExternalTensor lhs; - lhs.data = a->data; - lhs.ndim = a.Shape().size(); - lhs.shape = new int64_t[lhs.ndim]; - - runtime::NDArray b = args[1]; - ExternalTensor rhs; - rhs.data = b->data; - rhs.ndim = b.Shape().size(); - rhs.shape = new int64_t[rhs.ndim]; - - runtime::NDArray c = args[2]; - ExternalTensor out; - out.data = c->data; - out.ndim = c.Shape().size(); - out.shape = c.Shape().data(); - - for (int i = 0; i < lhs.ndim; i++) { - lhs.shape[i] = a.Shape()[i]; - rhs.shape[i] = b.Shape()[i]; - } - (*func_s_)(lhs, rhs, &out); - *rv = c; - }); + const std::string GetPrefix() const override { + return "gcc_"; } /*! @@ -104,10 +63,23 @@ class GccModuleNode : public ExternModuleNodeBase { const auto* call = func->body.as(); CHECK(call) << "GCC expects a single op."; + auto ashape = GetShape(call->args[0]); + auto bshape = GetShape(call->args[1]); + + // Check shape + CHECK(ashape.size() <= 2 && ashape.size() == bshape.size()) + << "Input shape dimensions are not consistent, " << ashape.size() << " vs. " + << bshape.size(); + for (int i = 0; i < ashape.size(); ++i) { + CHECK(ashape[i] == bshape[i]) << "Input shapes are not consistent at dim " << i << ":" + << ashape[i] << " vs. " << bshape[i]; + } + // Record subgraph ID for runtime invoke. auto id = GetSubgraphID(func); - std::string encoded_id = _prefix + id; - std::string code = "GCC_BINARY_OP(" + encoded_id + ", "; + SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); + std::string code = + "GCC_BINARY_OP_" + std::to_string(ashape.size()) + "D(" + GetPrefix() + id + ", "; if (IsOp(call, "add")) { code += "+"; @@ -118,6 +90,10 @@ class GccModuleNode : public ExternModuleNodeBase { } else { LOG(FATAL) << "Unrecognized op: "; } + + for (int i = 0; i < ashape.size(); ++i) { + code += ", " + std::to_string(ashape[i]); + } code += ");"; // Prepare library source @@ -137,10 +113,6 @@ class GccModuleNode : public ExternModuleNodeBase { LOG(FATAL) << "Failed to compile GCC library. 
Error code: " << ret; } } - - private: - GccBinaryFunc func_s_; - std::string _prefix = "gcc_"; }; diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc index 4a94b564465d..658537a06f3a 100644 --- a/src/relay/backend/contrib/gcc/libs.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -21,38 +21,19 @@ #include #include -#define GCC_BINARY_OP(OP, SYMB) \ - extern "C" void OP(ExternalTensor a, ExternalTensor b, \ - ExternalTensor* out) { \ - if (a.ndim > 2 || a.ndim != b.ndim || a.ndim != out->ndim) { \ - std::cerr << "Array sizes are not consistent, a.ndim = " << a.ndim \ - << ", b.ndim = " << b.ndim << ", out ndim = " << out->ndim \ - << std::endl; \ - } \ - for (int i = 0; i < a.ndim; i++) { \ - if (a.shape[i] != b.shape[i]) { \ - std::cerr << "shape[" << i << "]: a = " << a.shape[i] \ - << ", b = " << b.shape[i] << std::endl; \ - } \ - } \ - std::cout << "dim: " << a.ndim << " shape: " << std::endl; \ - for (int i = 0; i < a.ndim; i++) { \ - std::cout << a.shape[i] << " " << b.shape[i] << std::endl; \ - } \ - float* a_ptr = static_cast(a.data); \ - float* b_ptr = static_cast(b.data); \ - float* out_ptr = static_cast(out->data); \ - if (a.ndim == 1) { \ - for (int64_t i = 0; i < a.shape[0]; i++) { \ - out_ptr[i] = a_ptr[i] SYMB b_ptr[i]; \ - } \ - } else { \ - for (int64_t i = 0; i < a.shape[0]; i++) { \ - for (int64_t j = 0; j < a.shape[1]; j++) { \ - int64_t k = i * a.shape[1] + j; \ - out_ptr[k] = a_ptr[k] SYMB b_ptr[k]; \ - } \ - } \ - } \ +#define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + out[i] = a[i] p_OP_ b[i]; \ + } \ } +#define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ + } diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 904d24657cad..df3d5e58a3c2 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -239,7 +239,8 @@ class IndexedForwardGraph::Creator : private ExprVisitor { // Finally if the operator position is not a call node we will // need to call Update, as it may be an arbitrary expression. 
OpPatternKind op_pattern = kOpaque; - if (const OpNode* opnode = call->op.as()) { + const OpNode* opnode = call->op.as(); + if (opnode != nullptr && call->op != Op::Get("nn.batch_norm")) { op_pattern = static_cast(fpattern[GetRef(opnode)]); } else { this->Update(call->op, node, kOpaque); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 7f69801aedc5..7814296ff606 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -148,6 +148,44 @@ def test_extern_dnnl(): tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +def test_extern_dnnl_bn(): + dtype = 'float32' + shapes = [ + (1, 1024, 7, 7), + (1024, ), + (1024, ), + (1024, ), + (1024, ) + ] + eps = 1e-5 + + data = [np.absolute(np.random.normal(size=shape).astype('float32')) + for shape in shapes] + relay_args = [ + relay.var('data' + str(idx), shape=arg.shape, dtype=dtype) + for idx, arg in enumerate(data) + ] + + out = relay.nn.batch_norm(*relay_args, epsilon=eps)[0] + + f = relay.Function([*relay_args], out) + + mod = relay.Module() + mod['main'] = f + mod = relay.transform.ExternOp('dnnl')(mod) + mod = relay.transform.PartitionGraph()(mod) + + ref_mod = relay.Module() + ref_mod['main'] = f + + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(*data) + + ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(*data) + + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + def test_extern_dnnl_mobilenet(): # FIXME: This test is only for demo purpose and supposed to be removed. @@ -175,4 +213,5 @@ def test_extern_dnnl_mobilenet(): test_extern_gcc() test_extern_cblas() test_extern_dnnl() - #test_extern_dnnl_mobilenet() + test_extern_dnnl_bn() + #test_extern_dnnl_mobilenet() \ No newline at end of file From 66779ecf07750865c19cd8ee199b921bb0c01d4b Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 23 Sep 2019 20:57:01 +0000 Subject: [PATCH 16/34] to vm: add an InvokeExternal Instruction --- include/tvm/relay/contrib_codegen.h | 44 ++++++------- include/tvm/relay/expr.h | 8 +++ include/tvm/runtime/vm.h | 38 ++++++++++++ src/relay/backend/compile_engine.cc | 4 +- src/relay/backend/contrib/dnnl/libs.cc | 1 + src/relay/backend/vm/compiler.cc | 85 +++++++++++++++++--------- src/relay/backend/vm/compiler.h | 2 + src/relay/ir/expr.cc | 6 ++ src/runtime/vm/vm.cc | 63 +++++++++++++++++++ 9 files changed, 199 insertions(+), 52 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 05fb9adaab68..e6bfc369b496 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -83,39 +83,39 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { virtual void Build(const Expr& expr) = 0; void SetSubgraphInfo(std::string id, const DLDataType type, int num_args) { - _subgraph_info[id] = std::make_pair(type, num_args); + subgraph_info_[id] = std::make_pair(type, num_args); } std::pair GetSubgraphInfo(std::string id) { - if (_subgraph_info.count(id) == 0) { + if (subgraph_info_.count(id) == 0) { LOG(FATAL) << "Info of subgraph " << id << " is missing."; } - return _subgraph_info[id]; + return subgraph_info_[id]; } template - void Invoke(void* func_s, std::vector data) { + void Invoke(void* func_sym, std::vector data) { try { if (data.size() == 2) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); 
(*func)(data[0], data[1]); } else if (data.size() == 3) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2]); } else if (data.size() == 4) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2], data[3]); } else if (data.size() == 5) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2], data[3], data[4]); } else if (data.size() == 6) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2], data[3], data[4], data[5]); } else if (data.size() == 7) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6]); } else if (data.size() == 8) { - auto func = reinterpret_cast>(func_s); + auto func = reinterpret_cast>(func_sym); (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); } else { LOG(FATAL) << "Unsupported argument number: " << data.size(); @@ -136,8 +136,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - _curr_id = GetSubgraphID(name); - Open(this->GetExternLibPaths(_curr_id)); + curr_id_ = GetSubgraphID(name); + Open(this->GetExternLibPaths(curr_id_)); CHECK(handle_) << "The external module has not been built or failed to open.\n"; // Generate an external packed function @@ -145,17 +145,17 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { const DLTensor* dptr = ((runtime::NDArray) args[0]).operator->(); // Check type and argument number - auto info = GetSubgraphInfo(_curr_id); + auto info = GetSubgraphInfo(curr_id_); CHECK(info.first.code == dptr->dtype.code && info.first.bits == dptr->dtype.bits) - << "Data type of subgraph " << _curr_id << " and input is mismatch"; + << "Data type of subgraph " << curr_id_ << " and input is mismatch"; CHECK(info.second == args.size()) - << "Argument number of subgraph " << _curr_id + << "Argument number of subgraph " << curr_id_ << " and input data is mismatch: " << info.second << " vs. 
" << args.size(); // Get function from the library - std::string encoded_name = GetPrefix() + _curr_id; - auto func_s = GetSymbol(encoded_name); + std::string encoded_name = GetPrefix() + curr_id_; + auto func_sym = GetSymbol(encoded_name); // Reinterpret data and function to the right type and invoke if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { @@ -164,14 +164,14 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { runtime::NDArray arg = args[i]; data.push_back(reinterpret_cast(arg->data)); } - Invoke(func_s, data); + Invoke(func_sym, data); } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { std::vector data; for (int i = 0; i < args.size(); ++i) { runtime::NDArray arg = args[i]; data.push_back(reinterpret_cast(arg->data)); } - Invoke(func_s, data); + Invoke(func_sym, data); } else { LOG(FATAL) << "Only support float32 and float64 types."; } @@ -308,8 +308,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { #endif private: - std::string _curr_id; - std::unordered_map> _subgraph_info; + std::string curr_id_; + std::unordered_map> subgraph_info_; }; } // namespace contrib diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index 2aa88099a69c..db4cc6c993b0 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -268,6 +268,14 @@ class FunctionNode : public ExprNode { */ bool IsPrimitive() const; + /*! + * \brief Check whether the function is an external function. + * External functions are subgraphes that supported by external libraries. + * + * \return Whether the function is external or not. + */ + bool IsExternal() const; + TVM_DLL static Function make(tvm::Array params, Expr body, Type ret_type, diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index f7188e4b7896..820cca296f94 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_VM_H_ #define TVM_RUNTIME_VM_H_ +#include #include #include #include @@ -139,6 +140,7 @@ enum class Opcode { LoadConsti = 14U, Fatal = 15U, AllocStorage = 16U, + InvokeExternal = 17U, }; /*! \brief A single virtual machine instruction. @@ -202,6 +204,16 @@ struct Instruction { /*! \brief The arguments to pass to the packed function. */ RegName* packed_args; }; + struct /* InvokeExternal Operands */ { + /*! \brief The index into the external function table. */ + Index ext_index; + /*! \brief The arity of the external function. */ + Index ext_arity; + /*! \brief The number of outputs produced by the external function. */ + Index ext_output_size; + /*! \brief The arguments to pass to the external function. */ + RegName* ext_args; + }; struct /* If Operands */ { /*! \brief The register containing the test value. */ RegName test; @@ -299,6 +311,16 @@ struct Instruction { */ static Instruction AllocTensor(RegName storage, const std::vector& shape, DLDataType dtype, RegName dst); + /*! + * \brief Construct an invoke external instruction. + * \param packed_index The index of the external function. + * \param ext_arity The arity of the function. + * \param ext_output_size The number of outputs of the external function. + * \param args The argument registers. + * \return The invoke external instruction. + */ + static Instruction InvokeExternal(Index external_index, Index ext_arity, Index ext_output_size, + const std::vector& args); /*! * \brief Construct an allocate tensor instruction with register. * \param storage The storage to allocate out of. 
@@ -697,6 +719,20 @@ class VirtualMachine : public runtime::ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + /*! + * \brief Invoke an external function. + * + * \param external_index The offset of the external function in all functions. + * \param func The external function to be invoked. + * \param arg_count The number of arguments to the external function. + * \param output_size The number of outputs of the external function. + * \param args Arguments to the external function. + * + * \note The return value will be stored in the last output_size slots of args. + */ + virtual void InvokeExternal(Index External_index, const relay::Function& func, Index arg_count, + Index output_size, const std::vector& args); + virtual ~VirtualMachine() {} const char* type_key() const final { @@ -714,6 +750,8 @@ class VirtualMachine : public runtime::ModuleNode { protected: /*! \brief The virtual machine's packed function table. */ std::vector packed_funcs_; + /*! \brief The virtual machine's external function table. */ + std::vector external_funcs; /*! \brief The current stack of call frames. */ std::vector frames_; /*! \brief The fuction table index of the current function. */ diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 534d96b814ae..84cfe976ffbf 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -656,8 +656,8 @@ class CompileEngineImpl : public CompileEngineNode { cache_[key] = value; } - auto compiler = FunctionGetAttr(key->source_func, "External"); - if (compiler.defined()) { + if (key->source_func->IsExternal()) { + auto compiler = FunctionGetAttr(key->source_func, "External"); const tvm::ir::StringImm* code_gen = compiler.as(); CHECK(code_gen); std::string ext_name = "relay.ext." + code_gen->value; diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index 9380c48df736..dbe5ebe029b4 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -16,6 +16,7 @@ * under the License. */ +#include #include #include #include diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index c38ca1ae0469..28ed0277e489 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -295,6 +296,7 @@ class VMFunctionCompiler : ExprFunctor { last_register_ = instr.dst; break; case Opcode::InvokePacked: + case Opcode::InvokeExternal: case Opcode::If: case Opcode::Ret: case Opcode::Goto: @@ -444,6 +446,54 @@ class VMFunctionCompiler : ExprFunctor { argument_registers)); } + void EmitInvokeExternal(const Function& func, + const std::vector& unpacked_arg_regs, + size_t arity, + size_t return_count) { + CHECK(func->IsExternal()); + auto comp = FunctionGetAttr(func, "External"); + const auto* comp_name = comp.as(); + CHECK(comp_name); + // Append all subgraphs to a list, and then perform codegen for each + // category (i.e. the ones that use the same codegen should be compiled + // together.) + context_->external_funcs.push_back(func); + size_t subgraph_id = context_->external_funcs.size(); + // Emit an instruction to invoke the external function/subgraph. 
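+    // The register layout matches InvokePacked: unpacked_arg_regs lists the
+    // input registers first, and the last return_count registers are the
+    // outputs the external function writes into (see the InvokeExternal note
+    // in include/tvm/runtime/vm.h).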
+ Emit(Instruction::InvokeExternal(subgraph_id, arity, return_count, unpacked_arg_regs)); + } + + void EmitInvokePacked(const Function& func, + const std::vector& unpacked_arg_regs, + size_t arity, + size_t return_count) { + Target target; + if (targets_.size() == 1) { + // homogeneous execution. + for (auto kv : targets_) { + target = kv.second; + } + } else { + // heterogeneous execution. + LOG(FATAL) << "Currently VM compiler doesn't support heterogeneous compilation"; + } + auto key = CCacheKeyNode::make(func, target); + auto cfunc = engine_->Lower(key); + // TODO(jroesch): support lowered funcs for multiple targets + CHECK_EQ(cfunc->funcs.size(), 1); + auto op_index = -1; + if (context_->seen_funcs.find(cfunc->funcs[0]) == context_->seen_funcs.end()) { + op_index = context_->cached_funcs.size(); + context_->cached_funcs.push_back(cfunc); + context_->seen_funcs[cfunc->funcs[0]] = op_index; + } else { + op_index = context_->seen_funcs[cfunc->funcs[0]]; + } + + Emit(Instruction::InvokePacked(op_index, arity, return_count, unpacked_arg_regs)); + } + + void EmitInvokeTVMOp(const Function& func, const Expr& inputs, const Expr& outputs) { @@ -477,35 +527,14 @@ class VMFunctionCompiler : ExprFunctor { } // Next generate the invoke instruction. - Target target; - if (targets_.size() == 1) { - // homogeneous execution. - for (auto kv : targets_) { - target = kv.second; - } - } else { - // heterogeneous execution. - LOG(FATAL) << "Currently VM compiler doesn't support heterogeneous compilation"; - } - - auto key = CCacheKeyNode::make(func, target); - auto cfunc = engine_->Lower(key); - - // TODO(jroesch): support lowered funcs for multiple targets - CHECK_EQ(cfunc->funcs.size(), 1); - auto op_index = -1; - if (context_->seen_funcs.find(cfunc->funcs[0]) == context_->seen_funcs.end()) { - op_index = context_->cached_funcs.size(); - context_->cached_funcs.push_back(cfunc); - context_->seen_funcs[cfunc->funcs[0]] = op_index; + CHECK(func->IsPrimitive() || func->IsExternal()); + if (func->IsExternal()) { + EmitInvokeExternal(op_index, argument_registers.size(), output_tuple->fields.size(), + argument_registers); } else { - op_index = context_->seen_funcs[cfunc->funcs[0]]; + EmitInvokePacked(op_index, argument_registers.size(), output_tuple->fields.size(), + argument_registers); } - - Emit(Instruction::InvokePacked(op_index, - argument_registers.size(), - output_tuple->fields.size(), - argument_registers)); } void VisitExpr_(const CallNode* call_node) { @@ -639,7 +668,7 @@ class VMFunctionCompiler : ExprFunctor { } void VisitExpr_(const FunctionNode* func_node) { - if (!func_node->IsPrimitive()) { + if (!func_node->IsPrimitive() && !func_node->IsExternal()) { LOG(FATAL) << "local functions should have been removed by lambda lifting:" << std::endl << "Program: " << AsText(GetRef(func_node), false) << std::endl << "AST: " << GetRef(func_node); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 8cdb12e4dafa..e37cb25a414a 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -77,6 +77,8 @@ struct VMCompilerContext { std::vector cached_funcs; // The functions that have been lowered. std::unordered_map seen_funcs; + // List of external functions that are used by external libraries. 
+ std::vector external_funcs; }; diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 47e735f20fc8..c98470b8bfa8 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -182,6 +182,12 @@ TVM_REGISTER_API("relay._expr.FunctionGetParams") return func->GetParams(); }); +bool FunctionNode::IsExternal() const { + NodeRef res = FunctionGetAttr(GetRef(this), "External"); + const ir::IntImm* pval = res.as(); + return pval && pval->value != 0; +} + NodeRef FunctionGetAttr(const Function& func, const std::string& key) { if (!func->attrs.defined()) { return NodeRef(); } diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 333dd1e44506..e25298279523 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -105,6 +105,12 @@ Instruction::Instruction(const Instruction& instr) { this->output_size = instr.output_size; this->packed_args = Duplicate(instr.packed_args, instr.arity); return; + case Opcode::InvokeExternal: + this->ext_index = instr.ext_index; + this->ext_arity = instr.ext_arity; + this->ext_output_size = instr.ext_output_size; + this->ext_args = Duplicate(instr.ext_args, instr.ext_arity); + return; case Opcode::InvokeClosure: this->closure = instr.closure; this->num_closure_args = instr.num_closure_args; @@ -198,6 +204,13 @@ Instruction& Instruction::operator=(const Instruction& instr) { FreeIf(this->packed_args); this->packed_args = Duplicate(instr.packed_args, instr.arity); return *this; + case Opcode::InvokeExternal: + this->ext_index = instr.ext_index; + this->ext_arity = instr.ext_arity; + this->ext_output_size = instr.ext_output_size; + FreeIf(this->ext_args); + this->ext_args = Duplicate(instr.ext_args, instr.ext_arity); + return *this; case Opcode::InvokeClosure: this->closure = instr.closure; this->num_closure_args = instr.num_closure_args; @@ -262,6 +275,9 @@ Instruction::~Instruction() { case Opcode::InvokePacked: delete this->packed_args; return; + case Opcode::InvokeExternal: + delete this->ext_args; + return; case Opcode::InvokeClosure: delete this->closure_args; return; @@ -303,6 +319,22 @@ Instruction Instruction::InvokePacked(Index packed_index, return instr; } +Instruction Instruction::InvokeExternal(Index ext_index, + Index ext_arity, + Index ext_output_size, + const std::vector& args) { + Instruction instr; + instr.op = Opcode::InvokeExternal; + instr.ext_index = ext_index; + instr.ext_arity = ext_arity; + instr.ext_output_size = ext_output_size; + instr.ext_args = new RegName[ext_arity]; + for (Index i = 0; i < ext_arity; ++i) { + instr.ext_args[i] = args[i]; + } + return instr; +} + Instruction Instruction::AllocTensor( RegName storage, const std::vector& shape, @@ -516,6 +548,16 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { << ")"; break; } + case Opcode::InvokeExternal: { + os << "invoke_external Function[" << instr.ext_index << "] (in: $" + << StrJoin(instr.ext_args, 0, + instr.ext_arity - instr.ext_output_size, ", $") + << ", out: $" + << StrJoin(instr.ext_args, instr.ext_arity - instr.ext_output_size, + instr.ext_output_size, ", $") + << ")"; + break; + } case Opcode::AllocTensor: { os << "alloc_tensor $" << instr.dst << " $" << instr.alloc_tensor.storage << " [" @@ -802,6 +844,12 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } } +// TODO(@zhiics) Invoke the external function/subgraph. 
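+// A possible shape for this stub (not implemented here): look up or build the
+// external runtime module for `func` via the registered "relay.ext.<compiler>"
+// codegen, fetch its packed function for this subgraph, and call it with the
+// tensors read from `args`, writing the results into the trailing
+// `output_size` entries as the instruction layout expects.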
+void VirtualMachine::InvokeExternal(Index ext_index, + const relay::Function& func, + Index arg_count, Index output_size, + const std::vector& args) { +} void VirtualMachine::Init(const std::vector& ctxs) { ctxs_ = ctxs; @@ -907,6 +955,21 @@ void VirtualMachine::RunLoop() { pc_++; goto main_loop; } + case Opcode::InvokeExternal: { + const auto& func = external_funcs[instr.ext_index]; + const auto& arity = instr.ext_arity; + std::vector args; + for (Index i = 0; i < arity; ++i) { + args.push_back(ReadRegister(instr.ext_args[i])); + } + InvokeExternal(instr.ext_index, func, arity, instr.ext_output_size, args); + for (Index i = 0; i < instr.ext_output_size; ++i) { + WriteRegister(instr.ext_args[instr.ext_arity - instr.ext_output_size + i], + args[instr.ext_arity - instr.ext_output_size + i]); + } + pc++; + goto main_loop; + } case Opcode::InvokeClosure: { auto object = ReadRegister(instr.closure); const auto* closure = object.as(); From a7380a11f1ef38a81e6b982ec55899baf774b16d Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 25 Sep 2019 11:55:44 -0700 Subject: [PATCH 17/34] refactor backend interface and remove cblas --- cmake/modules/contrib/Extern.cmake | 8 - include/tvm/relay/contrib_codegen.h | 126 +------- python/tvm/relay/op/contrib/cblas/__init__.py | 20 -- .../tvm/relay/op/contrib/cblas/extern_op.py | 25 -- python/tvm/relay/op/contrib/dnnl/extern_op.py | 6 + python/tvm/relay/op/contrib/extern_op.py | 3 - src/relay/backend/contrib/cblas/codegen.cc | 136 --------- src/relay/backend/contrib/cblas/libs.cc | 32 -- src/relay/backend/contrib/dnnl/codegen.cc | 285 ++++++++++++++---- src/relay/backend/contrib/dnnl/libs.cc | 35 ++- src/relay/backend/contrib/dnnl/libs.h | 24 ++ src/relay/backend/contrib/gcc/codegen.cc | 208 ++++++++++--- src/relay/backend/contrib/gcc/libs.h | 6 +- src/relay/ir/expr.cc | 4 +- src/relay/pass/partition_graph.cc | 17 +- .../python/relay/test_pass_partition_graph.py | 262 +++++++++------- 16 files changed, 620 insertions(+), 577 deletions(-) delete mode 100644 python/tvm/relay/op/contrib/cblas/__init__.py delete mode 100644 python/tvm/relay/op/contrib/cblas/extern_op.py delete mode 100644 src/relay/backend/contrib/cblas/codegen.cc delete mode 100644 src/relay/backend/contrib/cblas/libs.cc create mode 100644 src/relay/backend/contrib/dnnl/libs.h diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 2fc88449d8cc..20e0cb6fa100 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -21,14 +21,6 @@ message(STATUS "Build with relay.backend.contrib") file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) -# CBLAS (for demo purpose) -file(GLOB CBLAS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cblas/codegen.cc) -if(USE_BLAS STREQUAL "mkl") - list(APPEND COMPILER_SRCS ${CBLAS_RELAY_CONTRIB_SRC}) -elseif(USE_BLAS STREQUAL "none") - # pass -endif() - # DNNL (for demo purpose) file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index e6bfc369b496..6edef52d52c7 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -38,21 +38,6 @@ namespace tvm { namespace relay { namespace contrib { -template -using F2ARGS = void (*)(T* a, T* b); -template -using F3ARGS = void (*)(T* a, T* b, T* c); -template -using F4ARGS = void (*)(T* a, T* b, T* 
c, T* d); -template -using F5ARGS = void (*)(T* a, T* b, T* c, T* d, T* e); -template -using F6ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f); -template -using F7ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f, T* g); -template -using F8ARGS = void (*)(T* a, T* b, T* c, T* d, T* e, T* f, T* g, T* h); - class ExternModuleNodeBase : public runtime:: ModuleNode { public: ExternModuleNodeBase() = default; @@ -82,49 +67,6 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ virtual void Build(const Expr& expr) = 0; - void SetSubgraphInfo(std::string id, const DLDataType type, int num_args) { - subgraph_info_[id] = std::make_pair(type, num_args); - } - - std::pair GetSubgraphInfo(std::string id) { - if (subgraph_info_.count(id) == 0) { - LOG(FATAL) << "Info of subgraph " << id << " is missing."; - } - return subgraph_info_[id]; - } - - template - void Invoke(void* func_sym, std::vector data) { - try { - if (data.size() == 2) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1]); - } else if (data.size() == 3) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2]); - } else if (data.size() == 4) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2], data[3]); - } else if (data.size() == 5) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2], data[3], data[4]); - } else if (data.size() == 6) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2], data[3], data[4], data[5]); - } else if (data.size() == 7) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6]); - } else if (data.size() == 8) { - auto func = reinterpret_cast>(func_sym); - (*func)(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); - } else { - LOG(FATAL) << "Unsupported argument number: " << data.size(); - } - } catch (const std::exception& e) { - LOG(FATAL) << "Execution failure: " << e.what(); - } - } - /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked * for execution given some parameters. @@ -134,50 +76,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return PackedFunc(nullptr) when it is not available. */ - runtime::PackedFunc GetFunction(const std::string& name, - const std::shared_ptr& sptr_to_self) override { - curr_id_ = GetSubgraphID(name); - Open(this->GetExternLibPaths(curr_id_)); - CHECK(handle_) << "The external module has not been built or failed to open.\n"; - - // Generate an external packed function - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - const DLTensor* dptr = ((runtime::NDArray) args[0]).operator->(); - - // Check type and argument number - auto info = GetSubgraphInfo(curr_id_); - CHECK(info.first.code == dptr->dtype.code && info.first.bits == dptr->dtype.bits) - << "Data type of subgraph " << curr_id_ << " and input is mismatch"; - CHECK(info.second == args.size()) - << "Argument number of subgraph " << curr_id_ - << " and input data is mismatch: " << info.second - << " vs. 
" << args.size(); - - // Get function from the library - std::string encoded_name = GetPrefix() + curr_id_; - auto func_sym = GetSymbol(encoded_name); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - std::vector data; - for (int i = 0; i < args.size(); ++i) { - runtime::NDArray arg = args[i]; - data.push_back(reinterpret_cast(arg->data)); - } - Invoke(func_sym, data); - } else if (runtime::TypeMatch(dptr->dtype, kDLFloat, 64)) { - std::vector data; - for (int i = 0; i < args.size(); ++i) { - runtime::NDArray arg = args[i]; - data.push_back(reinterpret_cast(arg->data)); - } - Invoke(func_sym, data); - } else { - LOG(FATAL) << "Only support float32 and float64 types."; - } - //*rv = out; - }); - } + virtual runtime::PackedFunc GetFunction( + const std::string& name, const std::shared_ptr& sptr_to_self) override = 0; /*! * \brief Get the source code of the external module. @@ -226,25 +126,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { return tokens[1]; } - bool IsOp(const CallNode* call, std::string op_name) { - const auto* op_node = call->op.as(); - CHECK(op_node) << "Expects a single op."; - Op op = GetRef(op_node); - return op == Op::Get(op_name); - } - protected: - std::vector GetShape(const Expr& expr) const { - const auto* ttype = expr->checked_type().as(); - CHECK(ttype); - std::vector _shape; - for (int i = 0; i < ttype->shape.size(); ++i) { - auto* val = ttype->shape[i].as(); - CHECK(val); - _shape.push_back(val->value); - } - return _shape; - } // Platform dependent handlers for opening system lib. #if defined(_WIN32) @@ -306,10 +188,6 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } } #endif - -private: - std::string curr_id_; - std::unordered_map> subgraph_info_; }; } // namespace contrib diff --git a/python/tvm/relay/op/contrib/cblas/__init__.py b/python/tvm/relay/op/contrib/cblas/__init__.py deleted file mode 100644 index 0da426ab4741..000000000000 --- a/python/tvm/relay/op/contrib/cblas/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=wildcard-import -"""Neural network related operators.""" -from __future__ import absolute_import as _abs -from .extern_op import * diff --git a/python/tvm/relay/op/contrib/cblas/extern_op.py b/python/tvm/relay/op/contrib/cblas/extern_op.py deleted file mode 100644 index 8de8a3f45534..000000000000 --- a/python/tvm/relay/op/contrib/cblas/extern_op.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name, unused-argument -"""CBLAS library supported operators.""" -from __future__ import absolute_import - - -def dense(attrs, args): - """Check if the external codegen should be used. - """ - return (args[0]._checked_type_.dtype == 'float32' or args[0]._checked_type_.dtype == 'float64') diff --git a/python/tvm/relay/op/contrib/dnnl/extern_op.py b/python/tvm/relay/op/contrib/dnnl/extern_op.py index ede78c668516..f5e26cde1cd0 100644 --- a/python/tvm/relay/op/contrib/dnnl/extern_op.py +++ b/python/tvm/relay/op/contrib/dnnl/extern_op.py @@ -35,5 +35,11 @@ def relu(attrs, args): def batch_norm(attrs, args): """Check if the external codegen should be used. + FIXME: Turn off due to not support of multiple outputs. """ return False + +def add(attrs, args): + """Check if the external codegen should be used. + """ + return True \ No newline at end of file diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index 05eb0f1edd7f..c046a17a7710 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -78,21 +78,18 @@ def external_batch_norm(attrs, args, compiler): """ return get_extern_op(compiler, 'batch_norm')(attrs, args) - @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): """Check if the external compiler should be used. """ return get_extern_op(compiler, 'subtract')(attrs, args) - @reg.register_extern_op("add") def external_add(attrs, args, compiler): """Check if the external compiler should be used. """ return get_extern_op(compiler, 'add')(attrs, args) - @reg.register_extern_op("multiply") def external_multiply(attrs, args, compiler): """Check if the external compiler should be used. diff --git a/src/relay/backend/contrib/cblas/codegen.cc b/src/relay/backend/contrib/cblas/codegen.cc deleted file mode 100644 index dc335d13723e..000000000000 --- a/src/relay/backend/contrib/cblas/codegen.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_WIN32) -#include -#else -#include -#endif - -namespace tvm { -namespace relay { -namespace contrib { - -class CblasModuleNode : public ExternModuleNodeBase { - public: - const std::vector GetExternLibPaths(std::string id = "") const override { - return {"/tmp/relay_cblas_lib_" + id + ".so"}; - } - - const std::string GetPrefix() const override { - return "cblas_"; - } - - /*! - * \brief Get the source code of the external module. - * - * \param format The format of the source code. - * - * \return The source code of the external library module in the text form. - */ - TVM_DLL std::string GetSource(const std::string& format = "") override { - return ""; - } - - const char* type_key() const override { - return "CblasModule"; - } - - void Build(const Expr& expr) override { - Function func = Downcast(expr); - CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; - const auto* call = func->body.as(); - CHECK(call) << "CBLAS expects a single dense op."; - - // Record subgraph ID for runtime invoke. - auto id = GetSubgraphID(func); - std::string code = ""; - - // Args: ID - std::vector args; - args.push_back(GetPrefix() + id); - - if (IsOp(call, "nn.dense")) { - auto ishape = GetShape(call->args[0]); - auto wshape = GetShape(call->args[1]); - - // Args: M, N, K - args.push_back(std::to_string(ishape[0])); - args.push_back(std::to_string(wshape[1])); - args.push_back(std::to_string(ishape[1])); - - auto type_node = call->checked_type().as(); - CHECK(type_node != nullptr); - CHECK(type_node->dtype.is_float()) << "Only support float types"; - auto bits = type_node->dtype.bits(); - SetSubgraphInfo(id, DLDataType{kDLFloat, static_cast(bits), 1}, 3); - - code = "DENSE_FP" + std::to_string(bits) + "(" + - args[0] + ", " + args[1] + ", " + args[2] + ", " + args[3] + ");"; - } else { - LOG(FATAL) << "CBLAS expects a single dense op."; - } - - if (!std::getenv("MKLROOT")) { - LOG(FATAL) << "MKLROOT not found. Did you source mklvars.sh?"; - } - std::string lib_src_name = "/tmp/relay_cblas_lib_" + id + ".cc"; - std::string lib_name = "/tmp/relay_cblas_lib_" + id + ".so"; - - // Prepare library source - std::string cmd = "cp src/relay/backend/contrib/cblas/libs.cc " + lib_src_name; - std::system(cmd.c_str()); - - cmd = "echo \"" + code + "\" >> " + lib_src_name; - std::system(cmd.c_str()); - - cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + " -o " + lib_name + - " -ldl -lpthread -lm -lmkl_rt"; - int ret = std::system(cmd.c_str()); - if (ret != 0) { - LOG(FATAL) << "Failed to compile CBLAS library. Error code: " << ret; - } - } -}; - -/*! - * \brief The external compiler/codegen tool. It takes a Relay expression and - * compile it into a runtime module. - */ -runtime::Module CblasCompiler(const Expr& expr) { - std::shared_ptr n = std::make_shared(); - n->Build(expr); - return runtime::Module(n); -} - -TVM_REGISTER_API("relay.ext.cblas") -.set_body_typed(CblasCompiler); - -} // namespace contrib -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/contrib/cblas/libs.cc b/src/relay/backend/contrib/cblas/libs.cc deleted file mode 100644 index 6cd08d75ca30..000000000000 --- a/src/relay/backend/contrib/cblas/libs.cc +++ /dev/null @@ -1,32 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#define DENSE_FP32(p_ID_, p_M_, p_N_, p_K_) \ - extern "C" void p_ID_(float* A, float* B, float* C) { \ - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p_M_, p_N_, p_K_, 1.0, A, p_K_, B, p_N_, \ - 0.0, C, p_N_); \ - } - -#define DENSE_FP64(p_ID_, p_M_, p_N_, p_K_) \ - extern "C" void p_ID_(double* A, double* B, double* C) { \ - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p_M_, p_N_, p_K_, 1.0, A, p_K_, B, p_N_, \ - 0.0, C, p_N_); \ - } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index d8f65cac504d..2440a4448d3b 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -32,53 +32,46 @@ #include #endif +#include "libs.h" + namespace tvm { namespace relay { namespace contrib { -class DNNLModuleNode : public ExternModuleNodeBase { +typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); + +// FIXME: This is an experimental implementation. We should implement all utilities +// and make a base claaa such as ExternBuilder for users to implement. +class DnnlBuilder : public ExprVisitor { public: - const std::vector GetExternLibPaths(std::string id) const override { - return {"/tmp/relay_dnnl_lib_" + id + ".so"}; - } + DnnlBuilder(std::string id) { this->_subgraph_id = id; } - const std::string GetPrefix() const override { - return "dnnl_"; + void VisitExpr_(const VarNode* node) final { + _subgraph_args.push_back(node->name_hint()); + _out.clear(); + _out.push_back({node->name_hint(), 0}); } - /*! - * \brief Get the source code of the external module. - * - * \param format The format of the source code. - * - * \return The source code of the external library module in the text form. - */ - TVM_DLL std::string GetSource(const std::string& format = "") override { return ""; } + void VisitExpr_(const TupleGetItemNode* op) final { + ; // Do nothing + } - const char* type_key() const override { return "DNNLModule"; } + void VisitExpr_(const CallNode* call) final { + std::string func_name = _subgraph_id + "_" + std::to_string(_func_idx++); - void Build(const Expr& expr) override { - Function func = Downcast(expr); - CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; - const auto* call = func->body.as(); - CHECK(call) << "DNNL expects a single convolution or dense op."; - - // Record subgraph ID for runtime invoke. 
- auto id = GetSubgraphID(func); - std::string encoded_id = GetPrefix() + id; - std::string code = ""; + // Make function declaration + std::string decl = ""; // Args: ID + std::string macro = ""; std::vector args; - args.push_back(encoded_id); if (IsOp(call, "nn.conv2d")) { - SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); - code = "CONV2D"; + macro = "CONV2D"; const auto* conv2d_attr = call->attrs.as(); - auto ishape = GetShape(call->args[0]); - auto wshape = GetShape(call->args[1]); + auto ishape = GetShape(call->args[0]->checked_type()); + auto wshape = GetShape(call->args[1]->checked_type()); // Args: N, C, H, W for (auto s : ishape) { @@ -95,10 +88,9 @@ class DNNLModuleNode : public ExternModuleNodeBase { args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); } else if (IsOp(call, "nn.dense")) { - SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); - code = "DENSE"; - auto ishape = GetShape(call->args[0]); - auto wshape = GetShape(call->args[1]); + macro = "DENSE"; + auto ishape = GetShape(call->args[0]->checked_type()); + auto wshape = GetShape(call->args[1]->checked_type()); // Args: N, C, O args.push_back(std::to_string(ishape[0])); @@ -106,65 +98,234 @@ class DNNLModuleNode : public ExternModuleNodeBase { args.push_back(std::to_string(wshape[0])); } else if (IsOp(call, "nn.relu")) { - SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 2); - code = "RELU"; - auto ishape = GetShape(call->args[0]); + macro = "RELU"; + auto ishape = GetShape(call->args[0]->checked_type()); // Args: N, C, H, W for (auto s : ishape) { args.push_back(std::to_string(s)); } } else if (IsOp(call, "nn.batch_norm")) { - SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 8); - code = "BN"; + macro = "BN"; const auto* bn_attr = call->attrs.as(); - auto ishape = GetShape(call->args[0]); + auto ishape = GetShape(call->args[0]->checked_type()); // Args: N, C, H, W for (auto s : ishape) { args.push_back(std::to_string(s)); - } + } + + // Args: epilson + args.push_back(std::to_string(bn_attr->epsilon)); + } else if (IsOp(call, "add")) { + macro = "ADD"; + auto ishape = GetShape(call->args[0]->checked_type()); - // Args: epilson - args.push_back(std::to_string(bn_attr->epsilon)); + // Args: H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } } else { - LOG(FATAL) << "DNNL expects a single convolution or dense op."; + LOG(FATAL) << "Unsupported op: " << AsText(call->op, false); + } + + decl = macro + "(" + func_name; + for (int i = 0; i < args.size(); ++i) { + decl += ", " + args[i]; + } + decl += ");"; + _func_decl.push_back(decl); + + // Make function call when visiting arguments + bool first = true; + std::string func_call = func_name + "("; + for (int i = 0; i < call->args.size(); ++i) { + VisitExpr(call->args[i]); + for (auto out : _out) { + if (!first) { + func_call += ", "; + } + first = false; + func_call += out.first; + } + } + + auto type_node = call->checked_type().as(); + CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) + << "Only support single output tensor with float type"; + std::string out = "buf_" + std::to_string(_buf_idx++); + auto out_shape = GetShape(call->checked_type()); + int out_size = 1; + for (int i = 0; i < out_shape.size(); ++i) { + out_size *= out_shape[i]; } - Compile(id, code, args); + std::string buf_decl = + "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; + _buf_decl.push_back(buf_decl); + + func_call += ", " + out + 
");"; + _subgraph_body.push_back(func_call); + + // Update output buffer + _out.clear(); + _out.push_back({out, out_size}); + } + + std::string build() { + std::string code = ""; + + // Write function macros + for (auto decl : _func_decl) { + code += decl + "\n"; + } + + // Write subgraph function declaration + code += "extern \\\"C\\\" void " + _subgraph_id + "(DnnlPackedArgs args, float* out) {\n"; + + // Unpack inputs + for (int i = 0; i < _subgraph_args.size(); ++i) { + code += "float* " + _subgraph_args[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; + } + // Function body + for (auto decl : _buf_decl) { + code += decl + "\n"; + } + for (auto stmt : _subgraph_body) { + code += stmt + "\n"; + } + + // Copy output + CHECK(_out.size() == 1) << "Internal error"; + code += "memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + + code += "}\n"; + return code; } private: - void Compile(std::string id, std::string code, std::vector args) { + std::string _subgraph_id = ""; + int _func_idx = 0; + int _buf_idx = 0; + std::vector _subgraph_args; + std::vector _subgraph_body; + std::vector _func_decl; + std::vector _buf_decl; + std::vector> _out; + + std::vector GetShape(const Type& type) const { + const auto* ttype = type.as(); + CHECK(ttype); + std::vector _shape; + for (int i = 0; i < ttype->shape.size(); ++i) { + auto* val = ttype->shape[i].as(); + CHECK(val); + _shape.push_back(val->value); + } + return _shape; + } + + bool IsOp(const CallNode* call, std::string op_name) { + const auto* op_node = call->op.as(); + CHECK(op_node) << "Expects a single op."; + Op op = GetRef(op_node); + return op == Op::Get(op_name); + } +}; + +class DNNLModuleNode : public ExternModuleNodeBase { + public: + const std::vector GetExternLibPaths(std::string id) const override { + return {"/tmp/relay_dnnl_lib_" + id + ".so"}; + } + + const std::string GetPrefix() const override { + return "dnnl_"; + } + + /*! + * \brief Get the source code of the external module. + * + * \param format The format of the source code. + * + * \return The source code of the external library module in the text form. + */ + TVM_DLL std::string GetSource(const std::string& format = "") override { return ""; } + + const char* type_key() const override { return "DNNLModule"; } + + /*! + * \brief Get a PackedFunc from module, which is a function ptr can be invoked + * for execution given some parameters. + * + * \param name the name of the external function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc(nullptr) when it is not available. 
+ */ + runtime::PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) override { + curr_id_ = GetSubgraphID(name); + Open(this->GetExternLibPaths(curr_id_)); + CHECK(handle_) << "The external module has not been built or failed to open.\n"; + + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); + runtime::NDArray out_arg = args[args.size() - 1]; + auto out = reinterpret_cast(out_arg->data); + + // Get function from the library + std::string encoded_name = GetPrefix() + curr_id_; + auto func_s = reinterpret_cast(GetSymbol(encoded_name)); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + DnnlPackedArgs packed_args; + packed_args.data = (void**)malloc(sizeof(float*) * args.size()); + for (int i = 0; i < args.size() - 1; ++i) { + runtime::NDArray arg = args[i]; + packed_args.data[i] = reinterpret_cast(arg->data); + } + (*func_s)(packed_args, out); + } else { + LOG(FATAL) << "Only support float32 type."; + } + *rv = out; + }); + } + + void Build(const Expr& expr) override { + Function func = Downcast(expr); + CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + const auto* call = func->body.as(); + CHECK(call) << "DNNL expects a single convolution or dense op"; + + // Record subgraph ID for runtime invoke. + auto id = GetSubgraphID(func); + auto builder = DnnlBuilder(GetPrefix() + id); + builder.VisitExpr(func->body); + std::string code = builder.build(); + + // Prepare library source // FIXME: Now we compile N libraries for N subgraphs, but we should merge them to one. std::string lib_src_name = "/tmp/relay_dnnl_lib_" + id + ".cc"; std::string lib_name = "/tmp/relay_dnnl_lib_" + id + ".so"; - - // Prepare library source std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + lib_src_name; std::system(cmd.c_str()); + std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"); - // Push macro implementation - bool first = true; - std::string macro = code + "("; - for (auto arg : args) { - if (!first) macro += ", "; - first = false; - macro += arg; - } - macro += ")"; - cmd = "echo \"" + macro + ";\" >> " + lib_src_name; + cmd = "echo \"" + code + "\" >> " + lib_src_name; std::system(cmd.c_str()); - // Compile - cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + - " -o " + lib_name + " -ldl -lpthread -lm -ldnnl"; + cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + " -o " + lib_name + + " -ldl -lpthread -lm -ldnnl"; int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; } } - std::string _curr_id; + private: + std::string curr_id_; }; /*! 
diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index dbe5ebe029b4..9ba12f869531 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -19,11 +19,12 @@ #include #include #include -#include #include +#include #include #include "dnnl.hpp" +#include "libs.h" using namespace dnnl; @@ -63,7 +64,7 @@ inline void read_from_dnnl_memory(void* handle, memory& mem) { auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); \ auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); \ auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); \ - auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::any); \ + auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); \ \ auto conv2d_desc = \ convolution_forward::desc(prop_kind::forward_inference, algorithm::convolution_direct, \ @@ -188,3 +189,33 @@ inline void read_from_dnnl_memory(void* handle, memory& mem) { read_from_dnnl_memory(out, dst_memory); \ free(weight); \ } + +#define ADD(p_ID_, p_N_, p_C_, p_H_, p_W_) \ + extern "C" void p_ID_(float* data, float* weight, float* out) { \ + using tag = memory::format_tag; \ + using dt = memory::data_type; \ + \ + engine eng(engine::kind::cpu, 0); \ + stream s(eng); \ + \ + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ + \ + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ + auto weight_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); \ + auto dst_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); \ + \ + auto data_memory = memory(data_md, eng, data); \ + auto weight_memory = memory(weight_md, eng, weight); \ + auto dst_memory = memory(dst_md, eng); \ + \ + auto add_desc = binary::desc(algorithm::binary_add, data_md, weight_md, dst_md); \ + auto add_prim_desc = binary::primitive_desc(add_desc, eng); \ + assert(dst_md == add_prim_desc.dst_desc()); \ + \ + auto add = binary(add_prim_desc); \ + add.execute(s, {{DNNL_ARG_SRC_0, data_memory}, \ + {DNNL_ARG_SRC_1, weight_memory}, \ + {DNNL_ARG_DST, dst_memory}}); \ + s.wait(); \ + read_from_dnnl_memory(out, dst_memory); \ + } diff --git a/src/relay/backend/contrib/dnnl/libs.h b/src/relay/backend/contrib/dnnl/libs.h new file mode 100644 index 000000000000..beaeaefcd3db --- /dev/null +++ b/src/relay/backend/contrib/dnnl/libs.h @@ -0,0 +1,24 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +typedef struct { + void** data; +} DnnlPackedArgs; \ No newline at end of file diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 45319b5ef4bf..a862b0e6a70a 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -30,7 +30,133 @@ namespace tvm { namespace relay { namespace contrib { -typedef void (*GccBinaryFunc)(ExternalTensor a, ExternalTensor b, ExternalTensor* out); +typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); + +// FIXME: This is an experimental implementation. We should implement all utilities +// and make a base claaa such as ExternBuilder for users to implement. +class GccBuilder : public ExprVisitor { + public: + GccBuilder(std::string id) { this->_subgraph_id = id; } + + void VisitExpr_(const VarNode* node) { + _subgraph_args.push_back(node->name_hint()); + _out.clear(); + _out.push_back({node->name_hint(), 0}); + } + + void VisitExpr_(const CallNode* call) final { + auto op_node = call->op.as(); + std::string func_name = _subgraph_id + "_" + std::to_string(_func_idx++); + + // Make function declaration + std::string decl = "GCC_BINARY_OP_" + std::to_string(call->args.size()) + + "D(" + func_name + ", "; + + if (GetRef(op_node) == Op::Get("add")) { + decl += "+"; + } else if (GetRef(op_node) == Op::Get("subtract")) { + decl += "-"; + } else if (GetRef(op_node) == Op::Get("multiply")) { + decl += "*"; + } else { + LOG(FATAL) << "Unrecognized op"; + } + + auto in_shape = GetShape(call->args[0]->checked_type()); + for (int i = 0; i < in_shape.size(); ++i) { + decl += ", " + std::to_string(in_shape[i]); + } + decl += ");"; + _func_decl.push_back(decl); + + // Make function call when visiting arguments + bool first = true; + std::string gcc_call = func_name + "("; + for (int i = 0; i < call->args.size(); ++i) { + VisitExpr(call->args[i]); + for (auto out : _out) { + if (!first) { + gcc_call += ", "; + } + first = false; + gcc_call += out.first; + } + } + + auto type_node = call->checked_type().as(); + CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) + << "Only support single output tensor with float type"; + std::string out = "buf_" + std::to_string(_buf_idx++); + auto out_shape = GetShape(call->checked_type()); + int out_size = 1; + for (int i = 0; i < out_shape.size(); ++i) { + out_size *= out_shape[i]; + } + std::string buf_decl = + "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; + _buf_decl.push_back(buf_decl); + + gcc_call += ", " + out + ");"; + _subgraph_body.push_back(gcc_call); + + // Update output buffer + _out.clear(); + _out.push_back({out, out_size}); + } + + std::string build() { + std::string code = ""; + + // Write function macros + for (auto decl : _func_decl) { + code += decl + "\n"; + } + + // Write subgraph function declaration + code += "extern \\\"C\\\" void " + _subgraph_id + "(GccPackedArgs args, float* out) {\n"; + + // Unpack inputs + for (int i = 0; i < _subgraph_args.size(); ++i) { + code += "float* " + _subgraph_args[i] + " = args.data[" + std::to_string(i) + "];"; + } + // Function body + for (auto decl : _buf_decl) { + code += decl + "\n"; + } + for (auto stmt : _subgraph_body) { + code += stmt + "\n"; + } + + // Copy output + CHECK(_out.size() == 1) << "Internal error"; + code += "memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + + code += "}\n"; + return code; + } + + private: + std::string _subgraph_id = ""; 
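+  // Codegen state accumulated while visiting the subgraph body:
+  // _func_idx/_buf_idx number the generated helper functions and buffers,
+  // _subgraph_args collects the free variables in visiting order,
+  // _subgraph_body holds one generated C call statement per operator,
+  // _func_decl holds the GCC_BINARY_OP_* macro instantiations,
+  // _buf_decl holds the intermediate float buffers, and _out tracks the
+  // buffer holding the latest output together with its element count.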
+ int _func_idx = 0; + int _buf_idx = 0; + std::vector _subgraph_args; + std::vector _subgraph_body; + std::vector _func_decl; + std::vector _buf_decl; + std::vector> _out; + + std::vector GetShape(const Type& type) const { + const auto* ttype = type.as(); + CHECK(ttype); + std::vector _shape; + for (int i = 0; i < ttype->shape.size(); ++i) { + auto* val = ttype->shape[i].as(); + CHECK(val); + _shape.push_back(val->value); + } + return _shape; + } +}; class GccModuleNode : public ExternModuleNodeBase { public: @@ -57,44 +183,49 @@ class GccModuleNode : public ExternModuleNodeBase { return "GccModule"; } + runtime::PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) override { + _curr_id = GetSubgraphID(name); + Open(this->GetExternLibPaths(_curr_id)); + CHECK(handle_) << "The external module has not been built or failed to open.\n"; + + // Generate an external packed function + return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); + runtime::NDArray out_arg = args[args.size() - 1]; + auto out = reinterpret_cast(out_arg->data); + + // Get function from the library + std::string encoded_name = GetPrefix() + _curr_id; + auto func_s = reinterpret_cast(GetSymbol(encoded_name)); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + GccPackedArgs packed_args; + packed_args.data = (float**)malloc(sizeof(float*) * args.size()); + for (int i = 0; i < args.size() - 1; ++i) { + runtime::NDArray arg = args[i]; + packed_args.data[i] = reinterpret_cast(arg->data); + } + (*func_s)(packed_args, out); + } else { + LOG(FATAL) << "Only support float32 type."; + } + *rv = out; + }); + } + void Build(const Expr& expr) override { Function func = Downcast(expr); CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); - CHECK(call) << "GCC expects a single op."; - - auto ashape = GetShape(call->args[0]); - auto bshape = GetShape(call->args[1]); - - // Check shape - CHECK(ashape.size() <= 2 && ashape.size() == bshape.size()) - << "Input shape dimensions are not consistent, " << ashape.size() << " vs. " - << bshape.size(); - for (int i = 0; i < ashape.size(); ++i) { - CHECK(ashape[i] == bshape[i]) << "Input shapes are not consistent at dim " << i << ":" - << ashape[i] << " vs. " << bshape[i]; - } + CHECK(call) << "Unknown error"; // comaniac: Don't know in what case this will fail. // Record subgraph ID for runtime invoke. 
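+    // For illustration (variable names assumed), a single "add" over two
+    // (10, 10) tensors makes the builder emit source roughly like:
+    //   GCC_BINARY_OP_2D(gcc_0_0, +, 10, 10);
+    //   extern "C" void gcc_0(GccPackedArgs args, float* out) {
+    //     float* x = args.data[0];
+    //     float* y = args.data[1];
+    //     float* buf_0 = (float*)malloc(4 * 100);
+    //     gcc_0_0(x, y, buf_0);
+    //     memcpy(out, buf_0, 4 * 100);
+    //   }
+    // which is appended to a copy of libs.cc and compiled into the .so below.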
auto id = GetSubgraphID(func); - SetSubgraphInfo(id, DLDataType{kDLFloat, 32, 1}, 3); - std::string code = - "GCC_BINARY_OP_" + std::to_string(ashape.size()) + "D(" + GetPrefix() + id + ", "; - - if (IsOp(call, "add")) { - code += "+"; - } else if (IsOp(call, "subtract")) { - code += "-"; - } else if (IsOp(call, "multiply")) { - code += "*"; - } else { - LOG(FATAL) << "Unrecognized op: "; - } - - for (int i = 0; i < ashape.size(); ++i) { - code += ", " + std::to_string(ashape[i]); - } - code += ");"; + auto builder = GccBuilder(GetPrefix() + id); + builder.VisitExpr(func->body); + std::string code = builder.build(); // Prepare library source std::string lib_src_name = "/tmp/relay_gcc_lib_" + id + ".cc"; @@ -102,17 +233,19 @@ class GccModuleNode : public ExternModuleNodeBase { std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + lib_src_name; std::system(cmd.c_str()); std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"); - + cmd = "echo \"" + code + "\" >> " + lib_src_name; std::system(cmd.c_str()); - cmd = "g++ -std=c++11 -shared -fPIC -ldl " + lib_src_name + - " -o " + lib_name; + cmd = "g++ -std=c++11 -shared -fPIC -ldl " + lib_src_name + " -o " + lib_name; int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile GCC library. Error code: " << ret; } } + + private: + std::string _curr_id; }; @@ -123,11 +256,6 @@ class GccModuleNode : public ExternModuleNodeBase { * The external codegen tool should have been registered similiarly to LLVM, * CUDA, etc, under TVM so the generated code could be packed in a runtime * module. This module simplifies code serialization and invocation. - * - * TODO(@zhiics) - * 1. Let the external compiler ingest a Relay module instead of - * a single expression/function. - * 2. Return runtime::Module. */ runtime::Module GccCompiler(const Expr& expr) { std::shared_ptr n = std::make_shared(); diff --git a/src/relay/backend/contrib/gcc/libs.h b/src/relay/backend/contrib/gcc/libs.h index 0467567d9795..1549cc2f6ef8 100644 --- a/src/relay/backend/contrib/gcc/libs.h +++ b/src/relay/backend/contrib/gcc/libs.h @@ -20,7 +20,5 @@ #include typedef struct { - void* data; - int ndim; - int64_t* shape; -} ExternalTensor; + float** data; +} GccPackedArgs; \ No newline at end of file diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index c98470b8bfa8..60fac27d7ef8 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -184,8 +184,8 @@ TVM_REGISTER_API("relay._expr.FunctionGetParams") bool FunctionNode::IsExternal() const { NodeRef res = FunctionGetAttr(GetRef(this), "External"); - const ir::IntImm* pval = res.as(); - return pval && pval->value != 0; + const ir::StringImm* pval = res.as(); + return pval; } NodeRef FunctionGetAttr(const Function& func, const std::string& key) { diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 60ffe3bc7234..f67c24efe5b0 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -113,6 +113,10 @@ class Partitioner : public ExprMutator { } void MergeSubgraph(Subgraph* subgraph1, Subgraph* subgraph2) { + if (subgraph1 == subgraph2) { + return; + } + // Merge subgraph 2 to subgraph 1 and erase subgraph 2. subgraph1->nodes.insert(subgraph2->nodes.begin(), subgraph2->nodes.end()); for (auto arg : subgraph2->args) { @@ -157,7 +161,7 @@ class Partitioner : public ExprMutator { // Find the corresponding subgraph and add the argument. 
auto subgraph = GetSubgraph(GetRef(call)); if (!subgraph) { - throw Error(RELAY_ERROR("Cannot find the corresponding subgraph for end annotation:\n" + throw Error(RELAY_ERROR("Cannot find the corresponding subgraph for start annotation:\n" << AsText(GetRef(call), false))); } subgraph->args.push_back({var, input_expr}); @@ -194,19 +198,8 @@ class Partitioner : public ExprMutator { auto subgraph_func = FunctionNode::make(params, input, call->args[0]->checked_type_, {}, Attrs()); - // FIXME: How to determine the function name? - // This is a hack for multiple subgraph test where each subgraph only has - // one call node. - // We can probably only pass "external" to indicate that this is an - // external funciton and leave the processing of the function to codegen. - // Otherwise, it's hard to deal with multiple-node subgraphs. Expr arg0 = call->args[0]; std::string name = "subgraph_" + std::to_string(subgraph->id); - if (const auto* arg_call = arg0.as()) { - if (const auto* op_node = arg_call->op.as()) { - name += "_" + op_node->name; - } - } subgraph_func = FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make(name)); subgraph_func = FunctionSetAttr(subgraph_func, "Primitive", tvm::Integer(1)); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 7814296ff606..1d799e9a32ab 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -24,47 +24,159 @@ from tvm.relay.expr_functor import ExprMutator from tvm.relay.annotation import subgraph_begin, subgraph_end -class MyAnnotator(ExprMutator): +class GCCAnnotator(ExprMutator): + """ + A simple annotator that creates the following subgraph: + | + -- begin -- + | + add + | + subtract + | + multiply + | + -- end -- + | + """ + def __init__(self): + super(GCCAnnotator, self).__init__() + self.in_subgraph = 0 + def visit_call(self, call): - #print(call.op.name) if call.op.name == "add": # Annotate begin at args - lhs = subgraph_begin(call.args[0], "gcc") - rhs = subgraph_begin(call.args[1], "gcc") - op = relay.add(lhs, rhs) + if self.in_subgraph == 1: + lhs = subgraph_begin(super().visit(call.args[0]), "gcc") + rhs = subgraph_begin(super().visit(call.args[1]), "gcc") + op = relay.add(lhs, rhs) + self.in_subgraph = 2 + return op + elif call.op.name == "subtract": + if self.in_subgraph == 1: + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = subgraph_begin(lhs, "gcc") + if isinstance(rhs, relay.expr.Var): + rhs = subgraph_begin(rhs, "gcc") + return relay.subtract(lhs, rhs) + elif call.op.name == "multiply": # Annotate end at output + self.in_subgraph = 1 + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = subgraph_begin(lhs, "gcc") + if isinstance(rhs, relay.expr.Var): + rhs = subgraph_begin(rhs, "gcc") + op = relay.multiply(lhs, rhs) + if self.in_subgraph == 2: + op = subgraph_end(op, "gcc") + self.in_subgraph = 0 return op - elif call.op.name == "concatenate": # Annotate end at output - op = super().visit_call(call) - return subgraph_end(op, "gcc") return super().visit_call(call) - def visit_function(self, func): - return relay.Function(func.params, self.visit(func.body)) +class WholeGraphAnnotator(ExprMutator): + """ + An annotator that creates a subgraph for an entire graph. 
+ """ + + def __init__(self, compiler): + super(WholeGraphAnnotator, self).__init__() + self.compiler = compiler + self.last_call = True + + def visit_call(self, call): + curr_last = self.last_call + self.last_call = False + + params = [] + for arg in call.args: + param = super().visit(arg) + if isinstance(param, relay.expr.Var): + param = subgraph_begin(param, self.compiler) + params.append(param) + + new_call = relay.Call(call.op, params, call.attrs) + if curr_last: + new_call = subgraph_end(new_call, self.compiler) + return new_call + +class MobileNetAnnotator(ExprMutator): + """ + Annotate mobilenet until global_avg_pool. + """ + + def __init__(self, compiler): + super(MobileNetAnnotator, self).__init__() + self.compiler = compiler + self.subgraph_open = False + + def visit_call(self, call): -def annotate(expr): - ann = MyAnnotator() - return ann.visit(expr) + if call.op.name == 'nn.global_avg_pool2d': + self.subgraph_open = True + subgraph_open = self.subgraph_open -def test_partition_graph(): + params = [] + for arg in call.args: + param = super().visit(arg) + if call.op.name == 'nn.global_avg_pool2d': + param = subgraph_end(param, self.compiler) + if subgraph_open and isinstance(param, relay.expr.Var): + param = subgraph_begin(param, self.compiler) + params.append(param) + + new_call = relay.Call(call.op, params, call.attrs) + return new_call + +def test_multi_node_subgraph(): x = relay.var('x', shape=(10, 10)) - z0 = relay.add(x, relay.const(0, dtype='float32')) - z1 = relay.add(x, relay.const(5, dtype='float32')) - z2 = relay.multiply(x, relay.const(2, dtype='float32')) - p0 = relay.subtract(z0, relay.const(3, dtype='float32')) - p1 = relay.subtract(z1, relay.const(4, dtype='float32')) - p2 = relay.add(z2, relay.const(7, dtype='float32')) - q = relay.concatenate((p0, p1, p2), axis=0) - f = relay.Function([x], q) + w0 = relay.var('w', shape=(10, 10)) + w1 = relay.var('w', shape=(10, 10)) + w2 = relay.var('w', shape=(10, 10)) + w3 = relay.var('w', shape=(10, 10)) + w4 = relay.var('w', shape=(10, 10)) + w5 = relay.var('w', shape=(10, 10)) + w6 = relay.var('w', shape=(10, 10)) + w7 = relay.var('w', shape=(10, 10)) + + # Subgraph on GCC + # FIXME: We generate two subgraphs for this case but they should be merged to one + # due to the common input (x). 
+ z0 = relay.add(x, w0) + p0 = relay.subtract(z0, w1) + q0 = relay.multiply(p0, w2) + + z1 = relay.add(x, w3) + p1 = relay.subtract(z1, w4) + q1 = relay.multiply(p1, w5) + + # Other parts on TVM + z2 = relay.add(x, w6) + q2 = relay.subtract(z2, w7) + + r = relay.concatenate((q0, q1, q2), axis=0) + f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r) mod = relay.Module() - mod["main"] = annotate(f) - print(mod['main']) + ann = GCCAnnotator() + mod["main"] = ann.visit(f) mod = relay.transform.PartitionGraph()(mod) mod = relay.transform.InferType()(mod) print(mod['main']) - #x_data = np.random.rand(10, 10).astype('float32') - #y_data = np.random.rand(10, 10).astype('float32') - # ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - # res = ex.evaluate()(x_data) - # tvm.testing.assert_allclose(res.asnumpy(), y_data - (x_data + x_data)) + + x_data = np.random.rand(10, 10).astype('float32') + w_data = [] + for _ in range(8): + w_data.append(np.random.rand(10, 10).astype('float32')) + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, *w_data) + tvm.testing.assert_allclose( + res.asnumpy(), + np.concatenate( + (((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) def test_extern_gcc(): x = relay.var('x', shape=(10, 10)) @@ -78,37 +190,15 @@ def test_extern_gcc(): mod["main"] = f mod = relay.transform.ExternOp("gcc")(mod) mod = relay.transform.PartitionGraph()(mod) - print(mod['main']) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) - -def test_extern_cblas(): - m = 16 - n = 224 - k = 224 - dtype = 'float64' - x = relay.var('x', shape=(m, k), dtype=dtype) - y = relay.var('y', shape=(n, k), dtype=dtype) - f = relay.Function([x, y], relay.op.nn.dense(x, y)) - mod = relay.Module() - mod['main'] = f - mod = relay.transform.ExternOp('cblas')(mod) - mod = relay.transform.PartitionGraph()(mod) - x_data = np.random.uniform(0, 1, (m, k)).astype(dtype) - y_data = np.random.uniform(0, 1, (n, k)).astype(dtype) ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose( - res.asnumpy(), np.dot(x_data, y_data.T), rtol=1e-5) + tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) def test_extern_dnnl(): dtype = 'float32' ishape = (1, 32, 14, 14) w1shape = (32, 1, 3, 3) - w2shape = (100, 32 * 14 * 14) - data = relay.var('data', shape=(ishape), dtype=dtype) weight1 = relay.var('weight1', shape=(w1shape), dtype=dtype) depthwise_conv2d_1 = relay.nn.conv2d(data, @@ -121,16 +211,12 @@ def test_extern_dnnl(): kernel_size=(3, 3), padding=(1, 1), groups=32) - out1 = relay.add(depthwise_conv2d_1, depthwise_conv2d_2) - out2 = relay.nn.batch_flatten(data=out1) - weight2 = relay.var('weight2', shape=(w2shape), dtype=dtype) - out3 = relay.nn.dense(out2, weight2) + out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2) - f = relay.Function([data, weight1, weight2], out3) + f = relay.Function([data, weight1], out) mod = relay.Module() - mod['main'] = f - mod = relay.transform.ExternOp('dnnl')(mod) + mod['main'] = WholeGraphAnnotator('dnnl').visit(f) mod = relay.transform.PartitionGraph()(mod) ref_mod = relay.Module() @@ -138,51 +224,12 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, 
w1shape).astype(dtype) - w2_data = np.random.uniform(0, 1, w2shape).astype(dtype) - - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(i_data, w1_data, w2_data) - - ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) - ref_res = ref_ex.evaluate()(i_data, w1_data, w2_data) - - tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) - -def test_extern_dnnl_bn(): - dtype = 'float32' - shapes = [ - (1, 1024, 7, 7), - (1024, ), - (1024, ), - (1024, ), - (1024, ) - ] - eps = 1e-5 - - data = [np.absolute(np.random.normal(size=shape).astype('float32')) - for shape in shapes] - relay_args = [ - relay.var('data' + str(idx), shape=arg.shape, dtype=dtype) - for idx, arg in enumerate(data) - ] - - out = relay.nn.batch_norm(*relay_args, epsilon=eps)[0] - - f = relay.Function([*relay_args], out) - - mod = relay.Module() - mod['main'] = f - mod = relay.transform.ExternOp('dnnl')(mod) - mod = relay.transform.PartitionGraph()(mod) - - ref_mod = relay.Module() - ref_mod['main'] = f ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(*data) + res = ex.evaluate()(i_data, w1_data) ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) - ref_res = ref_ex.evaluate()(*data) + ref_res = ref_ex.evaluate()(i_data, w1_data) tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -193,6 +240,7 @@ def test_extern_dnnl_mobilenet(): ishape = (1, 3, 224, 224) mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') + #mod['main'] = MobileNetAnnotator('dnnl').visit(mod['main']) mod = relay.transform.ExternOp('dnnl')(mod) mod = relay.transform.PartitionGraph()(mod) @@ -201,17 +249,17 @@ def test_extern_dnnl_mobilenet(): ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) res = ex.evaluate()(i_data, **params) - ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') - ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) - ref_res = ref_ex.evaluate()(i_data, **params) + # FIXME: When subgraph has only one op, Relay executor will use the cache value instead + # of re-computing, so the following checking logic does not work. 
+ #ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') + #ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) + #ref_res = ref_ex.evaluate()(i_data, **params) - tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + #tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) if __name__ == "__main__": - test_partition_graph() + test_multi_node_subgraph() test_extern_gcc() - test_extern_cblas() test_extern_dnnl() - test_extern_dnnl_bn() #test_extern_dnnl_mobilenet() \ No newline at end of file From d86a5f758ebd79fc6eeec6318d93fd2a39df2980 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 30 Sep 2019 18:40:32 +0000 Subject: [PATCH 18/34] To vm: enalbe multiple function compilation --- include/tvm/relay/contrib_codegen.h | 30 ++++-- include/tvm/runtime/vm.h | 22 ++--- src/relay/backend/compile_engine.cc | 2 +- src/relay/backend/contrib/dnnl/codegen.cc | 93 ++++++++++++------ src/relay/backend/contrib/dnnl/libs.cc | 1 + src/relay/backend/contrib/gcc/codegen.cc | 97 +++++++++++++------ src/relay/backend/contrib/gcc/libs.cc | 1 + src/relay/backend/vm/compiler.cc | 53 ++++++++-- src/relay/backend/vm/compiler.h | 7 +- src/runtime/vm/vm.cc | 13 ++- .../python/relay/test_pass_partition_graph.py | 52 +++++++--- 11 files changed, 264 insertions(+), 107 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 6edef52d52c7..48cc1c078bd0 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -50,7 +50,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return An array of strings of the library paths. */ - virtual const std::vector GetExternLibPaths(std::string id = "") const = 0; + virtual const std::vector GetExternLibPaths(const std::string& id = "") const = 0; /*! * \brief Get the function prefix of this compiler. @@ -59,13 +59,18 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { */ virtual const std::string GetPrefix() const = 0; + /*! + * \brief Compile the external library. + */ + virtual void CompileExternLib() = 0; + /*! * \brief Build the shared library of external ops. * - * \param expr The subgraph Relay expression to be executed using extern ops. + * \param ref The subgraph Relay expression/module to be executed using extern ops. * */ - virtual void Build(const Expr& expr) = 0; + virtual void Build(const NodeRef& ref) = 0; /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked @@ -101,14 +106,14 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return a vector of tokenized function name splitted by "_". */ - std::string GetSubgraphID(Function& func) { + std::string GetSubgraphID(const Function& func) const { const auto name_node = FunctionGetAttr(func, "func_name").as(); CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; std::string name = name_node->value; return GetSubgraphID(name); } - std::string GetSubgraphID(std::string name) { + std::string GetSubgraphID(const std::string& name) const { std::string temp = name; std::vector tokens; std::string delimiter = "_"; @@ -133,7 +138,12 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { // The handle. HMODULE handle_{nullptr}; - // Open the library + // Check if the handle_ is open. + bool IsOpen() const { + return handle_ != nullptr; + } + + // Open the library. 
virtual void Open(const std::string& name) { std::wstring wname(name.begin(), name.end()); handle_ = LoadLibraryW(wname.c_str()); @@ -155,9 +165,13 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { // The handle. void* handle_{nullptr}; - // load the library + // Check if the handle_ is open. + bool IsOpen() const { + return handle_ != nullptr; + } + + // load the library. virtual void Open(const std::vector lib_names) { - Close(); CHECK(lib_names.size() == 1) << "Default library loader only loads one library. " << "Please override the loader if multiple libraries are used"; handle_ = dlopen(lib_names[0].c_str(), RTLD_LAZY | RTLD_LOCAL); diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 820cca296f94..bcdbbbb6cf16 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -719,20 +719,6 @@ class VirtualMachine : public runtime::ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); - /*! - * \brief Invoke an external function. - * - * \param external_index The offset of the external function in all functions. - * \param func The external function to be invoked. - * \param arg_count The number of arguments to the external function. - * \param output_size The number of outputs of the external function. - * \param args Arguments to the external function. - * - * \note The return value will be stored in the last output_size slots of args. - */ - virtual void InvokeExternal(Index External_index, const relay::Function& func, Index arg_count, - Index output_size, const std::vector& args); - virtual ~VirtualMachine() {} const char* type_key() const final { @@ -752,6 +738,8 @@ class VirtualMachine : public runtime::ModuleNode { std::vector packed_funcs_; /*! \brief The virtual machine's external function table. */ std::vector external_funcs; + /*! \brief The external module/library. */ + std::vector ext_libs; /*! \brief The current stack of call frames. */ std::vector frames_; /*! \brief The fuction table index of the current function. */ @@ -845,6 +833,12 @@ class VirtualMachine : public runtime::ModuleNode { /*! \brief Get device context for params. */ TVMContext GetParamsContext() const; + std::unordered_map external_map; + + /*! \brief A mapping from the subgraph id to the external function name. + */ + std::unordered_map external_func_map; + private: /*! * \brief Invoke a global setting up the VM state to execute. 
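
The three tables added to the VirtualMachine above (ext_libs, external_map, external_func_map) are only declared at this point; how they compose is easier to see in isolation. Below is a minimal sketch, not part of the patch, of the lookup they enable. It assumes Index is the VM's 64-bit index type and that external_map maps a subgraph id to an index into ext_libs while external_func_map maps the same id to a function name, which matches how the LoadExecutable hunk later in this patch uses them.

    // Sketch only: how ext_libs, external_map and external_func_map compose when
    // the VM resolves the PackedFunc for an external subgraph. The key/value
    // types are assumptions; the logic mirrors the LoadExecutable change later
    // in this patch.
    #include <tvm/runtime/module.h>
    #include <tvm/runtime/packed_func.h>

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using Index = int64_t;

    tvm::runtime::PackedFunc ResolveExternalSubgraph(
        Index subgraph_id,
        std::vector<tvm::runtime::Module>* ext_libs,
        const std::unordered_map<Index, Index>& external_map,
        const std::unordered_map<Index, std::string>& external_func_map) {
      // external_map: which external library (by index into ext_libs) holds the subgraph.
      Index lib_index = external_map.at(subgraph_id);
      // external_func_map: the Relay function name recorded in the "func_name"
      // attribute, e.g. "subgraph_0"; the external module itself translates it
      // into its library symbol such as "gcc_0" or "dnnl_0".
      const std::string& func_name = external_func_map.at(subgraph_id);
      // The runtime::Module produced by "relay.ext.<codegen>" serves the subgraph
      // as an ordinary PackedFunc.
      return (*ext_libs)[static_cast<size_t>(lib_index)].GetFunction(func_name);
    }
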
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 84cfe976ffbf..c35b49fba8d4 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -668,7 +669,6 @@ class CompileEngineImpl : public CompileEngineNode { runtime::Module mod = (*pf)(key->source_func); value->lib = mod; value->cached_func = CachedFunc(); - // value->packed_func = (*pf)(key->source_func);; return value; } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 2440a4448d3b..44afb6f5b02d 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -184,19 +184,19 @@ class DnnlBuilder : public ExprVisitor { // Unpack inputs for (int i = 0; i < _subgraph_args.size(); ++i) { - code += "float* " + _subgraph_args[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; + code += " float* " + _subgraph_args[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; } // Function body for (auto decl : _buf_decl) { - code += decl + "\n"; + code += " " + decl + "\n"; } for (auto stmt : _subgraph_body) { - code += stmt + "\n"; + code += " " + stmt + "\n"; } // Copy output CHECK(_out.size() == 1) << "Internal error"; - code += "memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + code += " memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; code += "}\n"; return code; @@ -234,8 +234,10 @@ class DnnlBuilder : public ExprVisitor { class DNNLModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths(std::string id) const override { - return {"/tmp/relay_dnnl_lib_" + id + ".so"}; + const std::vector GetExternLibPaths(const std::string& id = "") const override { + CHECK_GT(src_lib_path_.count(id), 0U); + const auto& pair = src_lib_path_.at(id); + return {pair.second}; } const std::string GetPrefix() const override { @@ -264,17 +266,19 @@ class DNNLModuleNode : public ExternModuleNodeBase { */ runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - curr_id_ = GetSubgraphID(name); - Open(this->GetExternLibPaths(curr_id_)); - CHECK(handle_) << "The external module has not been built or failed to open.\n"; + std::string curr_id = GetSubgraphID(name); + if (!IsOpen()) { + Open(this->GetExternLibPaths(curr_id)); + } + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); runtime::NDArray out_arg = args[args.size() - 1]; auto out = reinterpret_cast(out_arg->data); // Get function from the library - std::string encoded_name = GetPrefix() + curr_id_; + std::string encoded_name = GetPrefix() + curr_id; auto func_s = reinterpret_cast(GetSymbol(encoded_name)); // Reinterpret data and function to the right type and invoke @@ -293,48 +297,77 @@ class DNNLModuleNode : public ExternModuleNodeBase { }); } - void Build(const Expr& expr) override { - Function func = Downcast(expr); + void CreateExternSignature(const Function& func, bool update) { CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); CHECK(call) << "DNNL expects a 
single convolution or dense op"; // Record subgraph ID for runtime invoke. - auto id = GetSubgraphID(func); - auto builder = DnnlBuilder(GetPrefix() + id); + auto sid = GetSubgraphID(func); + + if (update) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr; + std::stringstream ss; + ss << std::hex << distr(gen); + src_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".cc"; + lib_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".so"; + std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + src_path_; + std::system(cmd.c_str()); + std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"); + } + + // Save the src and lib path. + src_lib_path_.emplace(sid, std::make_pair(src_path_, lib_path_)); + + auto builder = DnnlBuilder(GetPrefix() + sid); builder.VisitExpr(func->body); std::string code = builder.build(); - // Prepare library source - // FIXME: Now we compile N libraries for N subgraphs, but we should merge them to one. - std::string lib_src_name = "/tmp/relay_dnnl_lib_" + id + ".cc"; - std::string lib_name = "/tmp/relay_dnnl_lib_" + id + ".so"; - std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + lib_src_name; - std::system(cmd.c_str()); - std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"); - - cmd = "echo \"" + code + "\" >> " + lib_src_name; + std::string cmd = "echo \"" + code + "\" >> " + src_path_; std::system(cmd.c_str()); + } - cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + lib_src_name + " -o " + lib_name + - " -ldl -lpthread -lm -ldnnl"; + void CompileExternLib() override { + std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + src_path_ + " -o " + lib_path_ + + " -ldl -lpthread -lm -ldnnl"; int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; } } + void Build(const NodeRef& ref) override { + if (ref->derived_from()) { + CreateExternSignature(Downcast(ref), true); + CompileExternLib(); + } else if (ref->derived_from()) { + relay::Module mod = Downcast(ref); + bool update = true; + for (const auto& it : mod->functions) { + CreateExternSignature(Downcast(it.second), update); + update = false; + } + CompileExternLib(); + } else { + LOG(FATAL) << "The input ref is expected to be a Relay function or module" + << "\n"; + } + } private: - std::string curr_id_; + std::string src_path_; + std::string lib_path_; + std::unordered_map > src_lib_path_; }; /*! - * \brief The external compiler/codegen tool. It takes a Relay expression and + * \brief The external compiler/codegen tool. It takes a Relay expression/module and * compile it into a runtime module. 
*/ -runtime::Module DNNLCompiler(const Expr& expr) { +runtime::Module DNNLCompiler(const NodeRef& ref) { std::shared_ptr n = std::make_shared(); - n->Build(expr); + n->Build(ref); return runtime::Module(n); } diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index 9ba12f869531..801d4fb9d73a 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include "dnnl.hpp" #include "libs.h" diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index a862b0e6a70a..602617447a5c 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -17,6 +17,8 @@ */ #include #include +#include +#include #include #include #include @@ -36,7 +38,7 @@ typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); // and make a base claaa such as ExternBuilder for users to implement. class GccBuilder : public ExprVisitor { public: - GccBuilder(std::string id) { this->_subgraph_id = id; } + GccBuilder(const std::string& id) { this->_subgraph_id = id; } void VisitExpr_(const VarNode* node) { _subgraph_args.push_back(node->name_hint()); @@ -117,19 +119,19 @@ class GccBuilder : public ExprVisitor { // Unpack inputs for (int i = 0; i < _subgraph_args.size(); ++i) { - code += "float* " + _subgraph_args[i] + " = args.data[" + std::to_string(i) + "];"; + code += " float* " + _subgraph_args[i] + " = args.data[" + std::to_string(i) + "];\n"; } // Function body for (auto decl : _buf_decl) { - code += decl + "\n"; + code += " " + decl + "\n"; } for (auto stmt : _subgraph_body) { - code += stmt + "\n"; + code += " " + stmt + "\n"; } // Copy output CHECK(_out.size() == 1) << "Internal error"; - code += "memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + code += " memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; code += "}\n"; return code; @@ -160,8 +162,10 @@ class GccBuilder : public ExprVisitor { class GccModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths(std::string id = "") const override { - return {"/tmp/relay_gcc_lib_" + id + ".so"}; + const std::vector GetExternLibPaths(const std::string& id = "") const override { + CHECK_GT(src_lib_path_.count(id), 0U); + const auto& pair = src_lib_path_.at(id); + return {pair.second}; } const std::string GetPrefix() const override { @@ -185,18 +189,19 @@ class GccModuleNode : public ExternModuleNodeBase { runtime::PackedFunc GetFunction(const std::string& name, const std::shared_ptr& sptr_to_self) override { - _curr_id = GetSubgraphID(name); - Open(this->GetExternLibPaths(_curr_id)); - CHECK(handle_) << "The external module has not been built or failed to open.\n"; - + std::string curr_id = GetSubgraphID(name); + if (!IsOpen()) { + Open(this->GetExternLibPaths(curr_id)); + } + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; // Generate an external packed function - return PackedFunc([sptr_to_self, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); runtime::NDArray out_arg = args[args.size() - 1]; auto out = reinterpret_cast(out_arg->data); // Get function from the library - std::string encoded_name = GetPrefix() + _curr_id; + std::string encoded_name = GetPrefix() + curr_id; 
auto func_s = reinterpret_cast(GetSymbol(encoded_name)); // Reinterpret data and function to the right type and invoke @@ -215,51 +220,83 @@ class GccModuleNode : public ExternModuleNodeBase { }); } - void Build(const Expr& expr) override { - Function func = Downcast(expr); + void CreateExternSignature (const Function& func, bool update) { CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); CHECK(call) << "Unknown error"; // comaniac: Don't know in what case this will fail. // Record subgraph ID for runtime invoke. - auto id = GetSubgraphID(func); - auto builder = GccBuilder(GetPrefix() + id); - builder.VisitExpr(func->body); - std::string code = builder.build(); + auto sid = GetSubgraphID(func); // Prepare library source - std::string lib_src_name = "/tmp/relay_gcc_lib_" + id + ".cc"; - std::string lib_name = "/tmp/relay_gcc_lib_" + id + ".so"; - std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + lib_src_name; - std::system(cmd.c_str()); - std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"); + if (update) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr; + std::stringstream ss; + ss << std::hex << distr(gen); + src_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".cc"; + lib_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".so"; + std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + src_path_; + std::system(cmd.c_str()); + std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"); + } + // Save the src and lib path. + src_lib_path_.emplace(sid, std::make_pair(src_path_, lib_path_)); + + auto builder = GccBuilder(GetPrefix() + sid); + builder.VisitExpr(func->body); + std::string code = builder.build(); - cmd = "echo \"" + code + "\" >> " + lib_src_name; + // Append the signature. + auto cmd = "echo \"" + code + "\" >> " + src_path_; std::system(cmd.c_str()); + } - cmd = "g++ -std=c++11 -shared -fPIC -ldl " + lib_src_name + " -o " + lib_name; + void CompileExternLib() override { + std::string cmd = + "g++ -std=c++11 -shared -fPIC -ldl " + src_path_ + " -o " + lib_path_; int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile GCC library. Error code: " << ret; } } + void Build(const NodeRef& ref) override { + if (ref->derived_from()) { + CreateExternSignature(Downcast(ref), true); + CompileExternLib(); + } else if (ref->derived_from()) { + relay::Module mod = Downcast(ref); + bool update = true; + for (const auto& it : mod->functions) { + CreateExternSignature(Downcast(it.second), update); + update = false; + } + CompileExternLib(); + } else { + LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; + } + } + private: - std::string _curr_id; + std::string src_path_; + std::string lib_path_; + std::unordered_map > src_lib_path_; }; /*! - * \brief The external compiler/codegen tool. It takes a Relay expression and + * \brief The external compiler/codegen tool. It takes a Relay expression/module and * compile it into a runtime module. * * The external codegen tool should have been registered similiarly to LLVM, * CUDA, etc, under TVM so the generated code could be packed in a runtime * module. This module simplifies code serialization and invocation. 
*/ -runtime::Module GccCompiler(const Expr& expr) { +runtime::Module GccCompiler(const NodeRef& ref) { std::shared_ptr n = std::make_shared(); - n->Build(expr); + n->Build(ref); return runtime::Module(n); } diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc index 658537a06f3a..361f9eda56c8 100644 --- a/src/relay/backend/contrib/gcc/libs.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -20,6 +20,7 @@ #include #include +#include #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ extern "C" void p_ID_(float* a, float* b, float* out) { \ diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 28ed0277e489..ef0c4dc10e6d 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -451,14 +451,11 @@ class VMFunctionCompiler : ExprFunctor { size_t arity, size_t return_count) { CHECK(func->IsExternal()); - auto comp = FunctionGetAttr(func, "External"); - const auto* comp_name = comp.as(); - CHECK(comp_name); // Append all subgraphs to a list, and then perform codegen for each // category (i.e. the ones that use the same codegen should be compiled // together.) - context_->external_funcs.push_back(func); size_t subgraph_id = context_->external_funcs.size(); + context_->external_funcs.push_back(func); // Emit an instruction to invoke the external function/subgraph. Emit(Instruction::InvokeExternal(subgraph_id, arity, return_count, unpacked_arg_regs)); } @@ -884,11 +881,13 @@ void VMCompiler::Compile(Module mod, exec_->constants.push_back(vm::Tensor(data)); } - LibraryCodegen(); + PrimitiveFuncCodegen(); for (auto gv : context_.global_map) { exec_->global_map.insert({gv.first->name_hint, gv.second}); } + + ExternalFuncCodegen(); } Module VMCompiler::OptimizeModule(const Module& mod, const TargetsMap& targets) { @@ -974,7 +973,7 @@ void VMCompiler::PopulateGlobalMap() { } } -void VMCompiler::LibraryCodegen() { +void VMCompiler::PrimitiveFuncCodegen() { auto const &cached_funcs = context_.cached_funcs; if (cached_funcs.size() == 0) { return; @@ -1008,6 +1007,48 @@ void VMCompiler::LibraryCodegen() { } } +void VMCompiler::ExternalFuncCodegen() { + // The codegen tool/compiler to the list of function mapping. + std::unordered_map comp_module; + // The codegen tool to lib index mapping. + std::unordered_map comp_map; + // The function index to the external function and codegen tool mapping. + std::unordered_map > func_codgen; + for (size_t i = 0; i < context_.external_funcs.size(); i++) { + const auto& it = context_.external_funcs[i]; + auto func_name = FunctionGetAttr(it, "func_name"); + CHECK(func_name.defined()) << "Cannot find func_name attribute"; + const auto* func_name_str = func_name.as(); + CHECK(func_name_str); + CHECK(it->IsExternal()); + auto comp = FunctionGetAttr(it, "External"); + const auto* comp_name = comp.as(); + CHECK(comp_name); + if (comp_module.count(comp_name->value) == 0) { + comp_module.emplace(comp_name->value, relay::ModuleNode::make({}, {})); + } + CHECK(it->checked_type_.defined()) + << "Please perform type inference on the external function first." + << "\n"; + comp_module[comp_name->value]->Add(GlobalVarNode::make(func_name_str->value), it); + func_codgen[i] = std::make_pair(func_name_str->value, comp_name->value); + } + + + for (const auto& it : comp_module) { + const auto *cg = runtime::Registry::Get("relay.ext." + it.first); + CHECK(cg) << "relay.ext." 
<< it.first << " is not registered"; + runtime::Module mod = (*cg)(it.second); + comp_map.emplace(it.first, vm_->ext_libs.size()); + vm_->ext_libs.push_back(mod); + } + + for (size_t i = 0; i < context_.external_funcs.size(); i++) { + vm_->external_func_map.emplace(i, std::get<0>(func_codgen[i])); + vm_->external_map.emplace(i, comp_map[std::get<1>(func_codgen[i])]); + } +} + runtime::Module CreateVMCompiler() { auto exec = make_object(); return runtime::Module(exec); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index e37cb25a414a..4600202a4be4 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -81,7 +81,6 @@ struct VMCompilerContext { std::vector external_funcs; }; - class VMCompiler : public runtime::ModuleNode { public: virtual ~VMCompiler() {} @@ -132,7 +131,11 @@ class VMCompiler : public runtime::ModuleNode { void PopulateGlobalMap(); - void LibraryCodegen(); + /* \brief Use TVM codegen to generat code for primitive functions. */ + void PrimitiveFuncCodegen(); + + /* \brief Use TVM codegen to generat code for external functions. */ + void ExternalFuncCodegen(); protected: /*! \brief Target devices. */ diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index e25298279523..48c5ddad992e 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -842,6 +842,17 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } packed_funcs_[packed_index] = lib.GetFunction(packed_name); } + + for (const auto& it : external_map) { + Index subgraph_id = it.first; + Index ext_lib_indx = it.second; + if (external_funcs.size() <= static_cast(subgraph_id)) { + external_funcs.resize(subgraph_id + 1); + } + CHECK_GT(external_func_map.count(subgraph_id), 0U); + external_funcs[subgraph_id] = + ext_libs[ext_lib_indx].GetFunction(external_func_map[subgraph_id]); + } } // TODO(@zhiics) Invoke the external function/subgraph. 
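
The Registry::Get("relay.ext." + name) lookup in ExternalFuncCodegen above, like the identical one in CompileEngine, only succeeds if a global packed function named "relay.ext.<codegen>" has been registered for every codegen tool referenced by the partitioned functions. The following is a hedged sketch of that registration contract for a hypothetical codegen called "xyz"; XyzCompiler and the empty module it returns are placeholders, and the registration macro shown is the usual TVM global-function registration rather than anything confirmed by this hunk.

    // Sketch only: the registration that Registry::Get("relay.ext." + name)
    // expects, for a hypothetical codegen named "xyz". A real compiler (see the
    // DNNL and GCC modules in this series) generates code for every function in
    // `ref` and returns a module whose GetFunction serves each subgraph by name.
    #include <tvm/relay/expr.h>
    #include <tvm/runtime/module.h>
    #include <tvm/runtime/registry.h>

    namespace tvm {
    namespace relay {
    namespace contrib {

    runtime::Module XyzCompiler(const NodeRef& ref) {
      // Placeholder body: a real implementation builds and loads a library here.
      (void)ref;
      return runtime::Module();
    }

    // Both CompileEngine and VMCompiler::ExternalFuncCodegen look the codegen up
    // under this exact "relay.ext." prefix.
    TVM_REGISTER_GLOBAL("relay.ext.xyz")
    .set_body_typed(XyzCompiler);

    }  // namespace contrib
    }  // namespace relay
    }  // namespace tvm
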
@@ -962,7 +973,7 @@ void VirtualMachine::RunLoop() { for (Index i = 0; i < arity; ++i) { args.push_back(ReadRegister(instr.ext_args[i])); } - InvokeExternal(instr.ext_index, func, arity, instr.ext_output_size, args); + InvokePacked(instr.ext_index, func, arity, instr.ext_output_size, args); for (Index i = 0; i < instr.ext_output_size; ++i) { WriteRegister(instr.ext_args[instr.ext_arity - instr.ext_output_size + i], args[instr.ext_arity - instr.ext_output_size + i]); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 1d799e9a32ab..74dacbe384cf 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -178,22 +178,41 @@ def test_multi_node_subgraph(): x_data + w_data[6] - w_data[7]), axis=0)) +def test_extern_gcc_single_op(): + x = relay.var('x', shape=(8, 8)) + y = relay.var('y', shape=(8, 8)) + z = x + y + f = relay.Function([x, y], z) + x_data = np.random.rand(8, 8).astype('float32') + y_data = np.random.rand(8, 8).astype('float32') + mod = relay.Module() + mod["main"] = f + mod = relay.transform.ExternOp("gcc")(mod) + mod = relay.transform.PartitionGraph()(mod) + + for kind in ["debug", "vm"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose(res.asnumpy(), (x_data + y_data)) + def test_extern_gcc(): - x = relay.var('x', shape=(10, 10)) - y = relay.var('y', shape=(10, 10)) + x = relay.var('x', shape=(2, 2)) + y = relay.var('y', shape=(2, 2)) z = x + x p = y * y f = relay.Function([x, y], p - z) - x_data = np.random.rand(10, 10).astype('float32') - y_data = np.random.rand(10, 10).astype('float32') + x_data = np.random.rand(2, 2).astype('float32') + y_data = np.random.rand(2, 2).astype('float32') mod = relay.Module() mod["main"] = f mod = relay.transform.ExternOp("gcc")(mod) mod = relay.transform.PartitionGraph()(mod) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) + for kind in ["debug", "vm"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + res = ex.evaluate()(x_data, y_data) + tvm.testing.assert_allclose(res.asnumpy(), + (y_data * y_data) - (x_data + x_data)) def test_extern_dnnl(): dtype = 'float32' @@ -225,13 +244,14 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(i_data, w1_data) + for kind in ["debug", "vm"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu()) + res = ex.evaluate()(i_data, w1_data) - ref_ex = relay.create_executor("debug", mod=ref_mod, ctx=tvm.cpu(0)) - ref_res = ref_ex.evaluate()(i_data, w1_data) + ref_ex = relay.create_executor(kind, mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(i_data, w1_data) - tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) def test_extern_dnnl_mobilenet(): @@ -246,8 +266,9 @@ def test_extern_dnnl_mobilenet(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(i_data, **params) + for kind in ["debug", "vm"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(0)) + res = 
ex.evaluate()(i_data, **params) # FIXME: When subgraph has only one op, Relay executor will use the cache value instead # of re-computing, so the following checking logic does not work. @@ -260,6 +281,7 @@ def test_extern_dnnl_mobilenet(): if __name__ == "__main__": test_multi_node_subgraph() + test_extern_gcc_single_op() test_extern_gcc() test_extern_dnnl() - #test_extern_dnnl_mobilenet() \ No newline at end of file + #test_extern_dnnl_mobilenet() From f4c55a5ef9d9b5b663d13e25d88975af06f86f2c Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Wed, 2 Oct 2019 20:08:12 +0000 Subject: [PATCH 19/34] enable vm test for subgraph with multiple nodes --- include/tvm/relay/contrib_codegen.h | 27 +++-- src/relay/backend/contrib/dnnl/codegen.cc | 108 ++++++++++-------- src/relay/backend/contrib/gcc/codegen.cc | 103 +++++++++-------- .../python/relay/test_pass_partition_graph.py | 27 +++-- 4 files changed, 148 insertions(+), 117 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 48cc1c078bd0..879203fffbda 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -50,7 +50,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return An array of strings of the library paths. */ - virtual const std::vector GetExternLibPaths(const std::string& id = "") const = 0; + virtual const std::vector GetExternLibPaths( + const std::string& id = "") const = 0; /*! * \brief Get the function prefix of this compiler. @@ -82,7 +83,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * \return PackedFunc(nullptr) when it is not available. */ virtual runtime::PackedFunc GetFunction( - const std::string& name, const std::shared_ptr& sptr_to_self) override = 0; + const std::string& name, + const std::shared_ptr& sptr_to_self) override = 0; /*! * \brief Get the source code of the external module. @@ -107,7 +109,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * \return a vector of tokenized function name splitted by "_". */ std::string GetSubgraphID(const Function& func) const { - const auto name_node = FunctionGetAttr(func, "func_name").as(); + const auto name_node = + FunctionGetAttr(func, "func_name").as(); CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; std::string name = name_node->value; return GetSubgraphID(name); @@ -127,7 +130,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { tokens.push_back(temp); CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; - CHECK(tokens[0] == "subgraph") << "Function name does not start with \"subgraph\": " << name; + CHECK(tokens[0] == "subgraph") + << "Function name does not start with \"subgraph\": " << name; return tokens[1]; } @@ -147,12 +151,14 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { virtual void Open(const std::string& name) { std::wstring wname(name.begin(), name.end()); handle_ = LoadLibraryW(wname.c_str()); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << name; + CHECK(handle_ != nullptr) + << "Failed to open the dynamic shared library " << name; } // Retrieve a symbol. virtual void* GetSymbol(const std::string& name) { - return reinterpret_cast(GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) + return reinterpret_cast( + GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) } // Close the handle. @@ -172,11 +178,12 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { // load the library. 
virtual void Open(const std::vector lib_names) { - CHECK(lib_names.size() == 1) << "Default library loader only loads one library. " - << "Please override the loader if multiple libraries are used"; + CHECK(lib_names.size() == 1) + << "Default library loader only loads one library. " + << "Please override the loader if multiple libraries are used"; handle_ = dlopen(lib_names[0].c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " << lib_names[0] << " " - << dlerror(); + CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " + << lib_names[0] << " " << dlerror(); } /*! diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 44afb6f5b02d..912430740ad0 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -16,7 +16,6 @@ * under the License. */ #include -#include #include #include #include @@ -26,6 +25,11 @@ #include #include +#include +#include +#include +#include + #if defined(_WIN32) #include #else @@ -41,15 +45,15 @@ namespace contrib { typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); // FIXME: This is an experimental implementation. We should implement all utilities -// and make a base claaa such as ExternBuilder for users to implement. +// and make a base class such as ExternBuilder for users to implement. class DnnlBuilder : public ExprVisitor { public: - DnnlBuilder(std::string id) { this->_subgraph_id = id; } + DnnlBuilder(std::string id) { this->subgraph_id_ = id; } void VisitExpr_(const VarNode* node) final { - _subgraph_args.push_back(node->name_hint()); - _out.clear(); - _out.push_back({node->name_hint(), 0}); + subgraph_args_.push_back(node->name_hint()); + out_.clear(); + out_.push_back({node->name_hint(), 0}); } void VisitExpr_(const TupleGetItemNode* op) final { @@ -57,7 +61,7 @@ class DnnlBuilder : public ExprVisitor { } void VisitExpr_(const CallNode* call) final { - std::string func_name = _subgraph_id + "_" + std::to_string(_func_idx++); + std::string func_name = subgraph_id_ + "_" + std::to_string(func_idx_++); // Make function declaration std::string decl = ""; @@ -130,18 +134,18 @@ class DnnlBuilder : public ExprVisitor { } decl = macro + "(" + func_name; - for (int i = 0; i < args.size(); ++i) { + for (size_t i = 0; i < args.size(); ++i) { decl += ", " + args[i]; } decl += ");"; - _func_decl.push_back(decl); + func_decl_.push_back(decl); // Make function call when visiting arguments bool first = true; std::string func_call = func_name + "("; - for (int i = 0; i < call->args.size(); ++i) { + for (size_t i = 0; i < call->args.size(); ++i) { VisitExpr(call->args[i]); - for (auto out : _out) { + for (auto out : out_) { if (!first) { func_call += ", "; } @@ -153,75 +157,75 @@ class DnnlBuilder : public ExprVisitor { auto type_node = call->checked_type().as(); CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) << "Only support single output tensor with float type"; - std::string out = "buf_" + std::to_string(_buf_idx++); + std::string out = "buf_" + std::to_string(buf_idx_++); auto out_shape = GetShape(call->checked_type()); int out_size = 1; - for (int i = 0; i < out_shape.size(); ++i) { + for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } std::string buf_decl = "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; - _buf_decl.push_back(buf_decl); + buf_decl_.push_back(buf_decl); func_call += ", " + out + 
");"; - _subgraph_body.push_back(func_call); + subgraph_body.push_back(func_call); // Update output buffer - _out.clear(); - _out.push_back({out, out_size}); + out_.clear(); + out_.push_back({out, out_size}); } std::string build() { std::string code = ""; // Write function macros - for (auto decl : _func_decl) { + for (auto decl : func_decl_) { code += decl + "\n"; } // Write subgraph function declaration - code += "extern \\\"C\\\" void " + _subgraph_id + "(DnnlPackedArgs args, float* out) {\n"; + code += "extern \\\"C\\\" void " + subgraph_id_ + "(DnnlPackedArgs args, float* out) {\n"; // Unpack inputs - for (int i = 0; i < _subgraph_args.size(); ++i) { - code += " float* " + _subgraph_args[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; + for (size_t i = 0; i < subgraph_args_.size(); ++i) { + code += " float* " + subgraph_args_[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; } // Function body - for (auto decl : _buf_decl) { + for (auto decl : buf_decl_) { code += " " + decl + "\n"; } - for (auto stmt : _subgraph_body) { + for (auto stmt : subgraph_body) { code += " " + stmt + "\n"; } // Copy output - CHECK(_out.size() == 1) << "Internal error"; - code += " memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + CHECK(out_.size() == 1) << "Internal error"; + code += " memcpy(out, " + out_[0].first + ", 4 *" + std::to_string(out_[0].second) + ");\n"; code += "}\n"; return code; } private: - std::string _subgraph_id = ""; - int _func_idx = 0; - int _buf_idx = 0; - std::vector _subgraph_args; - std::vector _subgraph_body; - std::vector _func_decl; - std::vector _buf_decl; - std::vector> _out; + std::string subgraph_id_ = ""; + int func_idx_ = 0; + int buf_idx_ = 0; + std::vector subgraph_args_; + std::vector subgraph_body; + std::vector func_decl_; + std::vector buf_decl_; + std::vector> out_; std::vector GetShape(const Type& type) const { const auto* ttype = type.as(); CHECK(ttype); - std::vector _shape; - for (int i = 0; i < ttype->shape.size(); ++i) { + std::vector shape; + for (size_t i = 0; i < ttype->shape.size(); ++i) { auto* val = ttype->shape[i].as(); CHECK(val); - _shape.push_back(val->value); + shape.push_back(val->value); } - return _shape; + return shape; } bool IsOp(const CallNode* call, std::string op_name) { @@ -234,7 +238,8 @@ class DnnlBuilder : public ExprVisitor { class DNNLModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths(const std::string& id = "") const override { + const std::vector GetExternLibPaths( + const std::string& id = "") const override { CHECK_GT(src_lib_path_.count(id), 0U); const auto& pair = src_lib_path_.at(id); return {pair.second}; @@ -251,7 +256,9 @@ class DNNLModuleNode : public ExternModuleNodeBase { * * \return The source code of the external library module in the text form. */ - TVM_DLL std::string GetSource(const std::string& format = "") override { return ""; } + TVM_DLL std::string GetSource(const std::string& format = "") override { + return ""; + } const char* type_key() const override { return "DNNLModule"; } @@ -264,15 +271,17 @@ class DNNLModuleNode : public ExternModuleNodeBase { * * \return PackedFunc(nullptr) when it is not available. 
*/ - runtime::PackedFunc GetFunction(const std::string& name, - const std::shared_ptr& sptr_to_self) override { + runtime::PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) override { std::string curr_id = GetSubgraphID(name); if (!IsOpen()) { Open(this->GetExternLibPaths(curr_id)); } CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, + tvm::TVMRetValue* rv) { const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); runtime::NDArray out_arg = args[args.size() - 1]; auto out = reinterpret_cast(out_arg->data); @@ -298,7 +307,8 @@ class DNNLModuleNode : public ExternModuleNodeBase { } void CreateExternSignature(const Function& func, bool update) { - CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + CHECK(func.defined()) + << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); CHECK(call) << "DNNL expects a single convolution or dense op"; @@ -314,8 +324,8 @@ class DNNLModuleNode : public ExternModuleNodeBase { src_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".cc"; lib_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".so"; std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + src_path_; - std::system(cmd.c_str()); - std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"); + CHECK_GE(std::system(cmd.c_str()), 0); + CHECK_GE(std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"), 0); } // Save the src and lib path. @@ -326,14 +336,14 @@ class DNNLModuleNode : public ExternModuleNodeBase { std::string code = builder.build(); std::string cmd = "echo \"" + code + "\" >> " + src_path_; - std::system(cmd.c_str()); + CHECK_GE(std::system(cmd.c_str()), 0); } void CompileExternLib() override { - std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + src_path_ + " -o " + lib_path_ + - " -ldl -lpthread -lm -ldnnl"; + std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + src_path_ + + " -o " + lib_path_ + " -ldl -lpthread -lm -ldnnl"; int ret = std::system(cmd.c_str()); - if (ret != 0) { + if (ret < 0) { LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; } } diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 602617447a5c..b8fe7351be77 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -15,10 +15,6 @@ * specific language governing permissions and limitations * under the License. */ -#include -#include -#include -#include #include #include #include @@ -26,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + #include "libs.h" namespace tvm { @@ -38,17 +40,17 @@ typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); // and make a base claaa such as ExternBuilder for users to implement. 
class GccBuilder : public ExprVisitor { public: - GccBuilder(const std::string& id) { this->_subgraph_id = id; } + GccBuilder(const std::string& id) { this->subgraph_id_ = id; } void VisitExpr_(const VarNode* node) { - _subgraph_args.push_back(node->name_hint()); - _out.clear(); - _out.push_back({node->name_hint(), 0}); + subgraph_args_.push_back(node->name_hint()); + out_.clear(); + out_.push_back({node->name_hint(), 0}); } void VisitExpr_(const CallNode* call) final { auto op_node = call->op.as(); - std::string func_name = _subgraph_id + "_" + std::to_string(_func_idx++); + std::string func_name = subgraph_id_ + "_" + std::to_string(func_idx++); // Make function declaration std::string decl = "GCC_BINARY_OP_" + std::to_string(call->args.size()) + @@ -65,18 +67,18 @@ class GccBuilder : public ExprVisitor { } auto in_shape = GetShape(call->args[0]->checked_type()); - for (int i = 0; i < in_shape.size(); ++i) { + for (size_t i = 0; i < in_shape.size(); ++i) { decl += ", " + std::to_string(in_shape[i]); } decl += ");"; - _func_decl.push_back(decl); + func_decl_.push_back(decl); // Make function call when visiting arguments bool first = true; std::string gcc_call = func_name + "("; - for (int i = 0; i < call->args.size(); ++i) { + for (size_t i = 0; i < call->args.size(); ++i) { VisitExpr(call->args[i]); - for (auto out : _out) { + for (auto out : out_) { if (!first) { gcc_call += ", "; } @@ -88,81 +90,82 @@ class GccBuilder : public ExprVisitor { auto type_node = call->checked_type().as(); CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) << "Only support single output tensor with float type"; - std::string out = "buf_" + std::to_string(_buf_idx++); + std::string out = "buf_" + std::to_string(buf_idx_++); auto out_shape = GetShape(call->checked_type()); int out_size = 1; - for (int i = 0; i < out_shape.size(); ++i) { + for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } std::string buf_decl = "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; - _buf_decl.push_back(buf_decl); + buf_decl_.push_back(buf_decl); gcc_call += ", " + out + ");"; - _subgraph_body.push_back(gcc_call); + subgraph_body.push_back(gcc_call); // Update output buffer - _out.clear(); - _out.push_back({out, out_size}); + out_.clear(); + out_.push_back({out, out_size}); } std::string build() { std::string code = ""; // Write function macros - for (auto decl : _func_decl) { + for (auto decl : func_decl_) { code += decl + "\n"; } // Write subgraph function declaration - code += "extern \\\"C\\\" void " + _subgraph_id + "(GccPackedArgs args, float* out) {\n"; + code += "extern \\\"C\\\" void " + subgraph_id_ + "(GccPackedArgs args, float* out) {\n"; // Unpack inputs - for (int i = 0; i < _subgraph_args.size(); ++i) { - code += " float* " + _subgraph_args[i] + " = args.data[" + std::to_string(i) + "];\n"; + for (size_t i = 0; i < subgraph_args_.size(); ++i) { + code += " float* " + subgraph_args_[i] + " = args.data[" + std::to_string(i) + "];\n"; } // Function body - for (auto decl : _buf_decl) { + for (auto decl : buf_decl_) { code += " " + decl + "\n"; } - for (auto stmt : _subgraph_body) { + for (auto stmt : subgraph_body) { code += " " + stmt + "\n"; } // Copy output - CHECK(_out.size() == 1) << "Internal error"; - code += " memcpy(out, " + _out[0].first + ", 4 *" + std::to_string(_out[0].second) + ");\n"; + CHECK(out_.size() == 1) << "Internal error"; + code += " memcpy(out, " + out_[0].first + ", 4 *" + std::to_string(out_[0].second) + ");\n"; 
code += "}\n"; return code; } private: - std::string _subgraph_id = ""; - int _func_idx = 0; - int _buf_idx = 0; - std::vector _subgraph_args; - std::vector _subgraph_body; - std::vector _func_decl; - std::vector _buf_decl; - std::vector> _out; + std::string subgraph_id_ = ""; + int func_idx = 0; + int buf_idx_ = 0; + std::vector subgraph_args_; + std::vector subgraph_body; + std::vector func_decl_; + std::vector buf_decl_; + std::vector> out_; std::vector GetShape(const Type& type) const { const auto* ttype = type.as(); CHECK(ttype); - std::vector _shape; - for (int i = 0; i < ttype->shape.size(); ++i) { + std::vector shape; + for (size_t i = 0; i < ttype->shape.size(); ++i) { auto* val = ttype->shape[i].as(); CHECK(val); - _shape.push_back(val->value); + shape.push_back(val->value); } - return _shape; + return shape; } }; class GccModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths(const std::string& id = "") const override { + const std::vector GetExternLibPaths( + const std::string& id = "") const override { CHECK_GT(src_lib_path_.count(id), 0U); const auto& pair = src_lib_path_.at(id); return {pair.second}; @@ -187,15 +190,17 @@ class GccModuleNode : public ExternModuleNodeBase { return "GccModule"; } - runtime::PackedFunc GetFunction(const std::string& name, - const std::shared_ptr& sptr_to_self) override { + runtime::PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) override { std::string curr_id = GetSubgraphID(name); if (!IsOpen()) { Open(this->GetExternLibPaths(curr_id)); } CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; // Generate an external packed function - return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, + tvm::TVMRetValue* rv) { const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); runtime::NDArray out_arg = args[args.size() - 1]; auto out = reinterpret_cast(out_arg->data); @@ -221,7 +226,8 @@ class GccModuleNode : public ExternModuleNodeBase { } void CreateExternSignature (const Function& func, bool update) { - CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; + CHECK(func.defined()) + << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); CHECK(call) << "Unknown error"; // comaniac: Don't know in what case this will fail. @@ -238,8 +244,8 @@ class GccModuleNode : public ExternModuleNodeBase { src_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".cc"; lib_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".so"; std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + src_path_; - std::system(cmd.c_str()); - std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"); + CHECK_GE(std::system(cmd.c_str()), 0); + CHECK_GE(std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"), 0); } // Save the src and lib path. src_lib_path_.emplace(sid, std::make_pair(src_path_, lib_path_)); @@ -250,7 +256,7 @@ class GccModuleNode : public ExternModuleNodeBase { // Append the signature. 
auto cmd = "echo \"" + code + "\" >> " + src_path_; - std::system(cmd.c_str()); + CHECK_GE(std::system(cmd.c_str()), 0); } void CompileExternLib() override { @@ -275,7 +281,8 @@ class GccModuleNode : public ExternModuleNodeBase { } CompileExternLib(); } else { - LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; + LOG(FATAL) << "The input ref is expected to be a Relay function or module" + << "\n"; } } diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 74dacbe384cf..6751f63e7e18 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -75,6 +75,7 @@ def visit_call(self, call): return op return super().visit_call(call) + class WholeGraphAnnotator(ExprMutator): """ An annotator that creates a subgraph for an entire graph. @@ -101,6 +102,7 @@ def visit_call(self, call): new_call = subgraph_end(new_call, self.compiler) return new_call + class MobileNetAnnotator(ExprMutator): """ Annotate mobilenet until global_avg_pool. @@ -129,6 +131,7 @@ def visit_call(self, call): new_call = relay.Call(call.op, params, call.attrs) return new_call + def test_multi_node_subgraph(): x = relay.var('x', shape=(10, 10)) w0 = relay.var('w', shape=(10, 10)) @@ -162,21 +165,23 @@ def test_multi_node_subgraph(): mod["main"] = ann.visit(f) mod = relay.transform.PartitionGraph()(mod) mod = relay.transform.InferType()(mod) - print(mod['main']) x_data = np.random.rand(10, 10).astype('float32') w_data = [] for _ in range(8): w_data.append(np.random.rand(10, 10).astype('float32')) - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) - res = ex.evaluate()(x_data, *w_data) - tvm.testing.assert_allclose( - res.asnumpy(), - np.concatenate( - (((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) + + for kind in ["debug", "vm"]: + ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + res = ex.evaluate()(x_data, *w_data) + tvm.testing.assert_allclose( + res.asnumpy(), + np.concatenate( + (((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) + def test_extern_gcc_single_op(): x = relay.var('x', shape=(8, 8)) @@ -195,6 +200,7 @@ def test_extern_gcc_single_op(): res = ex.evaluate()(x_data, y_data) tvm.testing.assert_allclose(res.asnumpy(), (x_data + y_data)) + def test_extern_gcc(): x = relay.var('x', shape=(2, 2)) y = relay.var('y', shape=(2, 2)) @@ -214,6 +220,7 @@ def test_extern_gcc(): tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) + def test_extern_dnnl(): dtype = 'float32' ishape = (1, 32, 14, 14) From 5f8ecaff89393c127403cbde47c13e3fdc00a5b7 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Wed, 9 Oct 2019 17:24:47 +0000 Subject: [PATCH 20/34] fix lint --- include/tvm/relay/contrib_codegen.h | 9 +++++---- python/tvm/relay/op/contrib/dnnl/extern_op.py | 7 ++++++- python/tvm/relay/op/contrib/extern_op.py | 13 +++++++++++-- src/relay/backend/contrib/dnnl/codegen.cc | 12 +++++++----- src/relay/backend/contrib/dnnl/libs.cc | 13 ++++++++----- src/relay/backend/contrib/dnnl/libs.h | 7 ++++++- src/relay/backend/contrib/gcc/codegen.cc | 13 +++++++------ src/relay/backend/contrib/gcc/libs.cc | 2 +- src/relay/backend/contrib/gcc/libs.h | 6 +++++- src/relay/backend/vm/compiler.cc | 2 +- src/relay/backend/vm/compiler.h | 2 +- src/relay/pass/extern_op.cc 
| 3 +-- 12 files changed, 59 insertions(+), 30 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index 879203fffbda..ff91881bcb9d 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -18,15 +18,17 @@ #ifndef TVM_RELAY_CONTRIB_CODEGEN_H_ #define TVM_RELAY_CONTRIB_CODEGEN_H_ -#include #include +#include #include #include #include #include #include #include + #include +#include #if defined(_WIN32) #include @@ -82,7 +84,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { * * \return PackedFunc(nullptr) when it is not available. */ - virtual runtime::PackedFunc GetFunction( + runtime::PackedFunc GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) override = 0; @@ -115,7 +117,7 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { std::string name = name_node->value; return GetSubgraphID(name); } - + std::string GetSubgraphID(const std::string& name) const { std::string temp = name; std::vector tokens; @@ -136,7 +138,6 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } protected: - // Platform dependent handlers for opening system lib. #if defined(_WIN32) // The handle. diff --git a/python/tvm/relay/op/contrib/dnnl/extern_op.py b/python/tvm/relay/op/contrib/dnnl/extern_op.py index f5e26cde1cd0..fb967872a588 100644 --- a/python/tvm/relay/op/contrib/dnnl/extern_op.py +++ b/python/tvm/relay/op/contrib/dnnl/extern_op.py @@ -18,28 +18,33 @@ """CBLAS library supported operators.""" from __future__ import absolute_import + def conv2d(attrs, args): """Check if the external codegen should be used. """ return True + def dense(attrs, args): """Check if the external codegen should be used. """ return True + def relu(attrs, args): """Check if the external codegen should be used. """ return True + def batch_norm(attrs, args): """Check if the external codegen should be used. FIXME: Turn off due to not support of multiple outputs. """ return False + def add(attrs, args): """Check if the external codegen should be used. """ - return True \ No newline at end of file + return True diff --git a/python/tvm/relay/op/contrib/extern_op.py b/python/tvm/relay/op/contrib/extern_op.py index c046a17a7710..e1310f7a25bd 100644 --- a/python/tvm/relay/op/contrib/extern_op.py +++ b/python/tvm/relay/op/contrib/extern_op.py @@ -39,7 +39,9 @@ # Load available contrib compilers compilers = {} for _, name, _ in pkgutil.iter_modules([Path(__file__).parent]): - compilers[name] = import_module('.%s' % name, package='.'.join(__name__.split('.')[:-1])) + compilers[name] = import_module( + '.%s' % name, package='.'.join(__name__.split('.')[:-1])) + def get_extern_op(compiler, op_name): """Get the extern op function from the registered compiler @@ -50,9 +52,11 @@ def get_extern_op(compiler, op_name): if hasattr(extern_op, op_name): return getattr(extern_op, op_name) - logger.warning("%s in %s is not registered. Fallback to CPU" % (op_name, compiler)) + logger.warning("%s in %s is not registered. Fallback to CPU", op_name, + compiler) return lambda x, y: False + @reg.register_extern_op("nn.conv2d") def external_conv2d(attrs, args, compiler): """Check if the external compiler should be used. @@ -66,30 +70,35 @@ def external_dense(attrs, args, compiler): """ return get_extern_op(compiler, 'dense')(attrs, args) + @reg.register_extern_op("nn.relu") def external_relu(attrs, args, compiler): """Check if the external compiler should be used. 
""" return get_extern_op(compiler, 'relu')(attrs, args) + @reg.register_extern_op("nn.batch_norm") def external_batch_norm(attrs, args, compiler): """Check if the external compiler should be used. """ return get_extern_op(compiler, 'batch_norm')(attrs, args) + @reg.register_extern_op("subtract") def external_subtract(attrs, args, compiler): """Check if the external compiler should be used. """ return get_extern_op(compiler, 'subtract')(attrs, args) + @reg.register_extern_op("add") def external_add(attrs, args, compiler): """Check if the external compiler should be used. """ return get_extern_op(compiler, 'add')(attrs, args) + @reg.register_extern_op("multiply") def external_multiply(attrs, args, compiler): """Check if the external compiler should be used. diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 912430740ad0..dba9356e26b9 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -15,6 +15,7 @@ * specific language governing permissions and limitations * under the License. */ +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #if defined(_WIN32) @@ -48,7 +48,7 @@ typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); // and make a base class such as ExternBuilder for users to implement. class DnnlBuilder : public ExprVisitor { public: - DnnlBuilder(std::string id) { this->subgraph_id_ = id; } + explicit DnnlBuilder(const std::string& id) { this->subgraph_id_ = id; } void VisitExpr_(const VarNode* node) final { subgraph_args_.push_back(node->name_hint()); @@ -57,7 +57,7 @@ class DnnlBuilder : public ExprVisitor { } void VisitExpr_(const TupleGetItemNode* op) final { - ; // Do nothing + // Do nothing } void VisitExpr_(const CallNode* call) final { @@ -188,7 +188,8 @@ class DnnlBuilder : public ExprVisitor { // Unpack inputs for (size_t i = 0; i < subgraph_args_.size(); ++i) { - code += " float* " + subgraph_args_[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; + code += + " float* " + subgraph_args_[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; } // Function body for (auto decl : buf_decl_) { @@ -293,7 +294,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { // Reinterpret data and function to the right type and invoke if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { DnnlPackedArgs packed_args; - packed_args.data = (void**)malloc(sizeof(float*) * args.size()); + packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); for (int i = 0; i < args.size() - 1; ++i) { runtime::NDArray arg = args[i]; packed_args.data[i] = reinterpret_cast(arg->data); @@ -365,6 +366,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { << "\n"; } } + private: std::string src_path_; std::string lib_path_; diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index 801d4fb9d73a..9ab69593ef80 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -16,25 +16,28 @@ * under the License. 
*/ +#include "libs.h" + +#include #include #include + #include #include #include #include -#include +#include #include "dnnl.hpp" -#include "libs.h" using namespace dnnl; // Read from memory, write to handle -inline void read_from_dnnl_memory(void* handle, memory& mem) { +inline void read_from_dnnl_memory(void* handle, const memory& mem) { size_t bytes = mem.get_desc().get_size(); uint8_t* src = static_cast(mem.get_data_handle()); - std::copy(src, src + bytes, (uint8_t*)handle); + std::copy(src, src + bytes, reinterpret_cast(handle)); } #define CONV2D(p_ID_, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, \ @@ -172,7 +175,7 @@ inline void read_from_dnnl_memory(void* handle, memory& mem) { auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); \ assert(data_md == bn_prim_desc.dst_desc()); \ \ - float* weight = (float*)malloc(sizeof(float) * 2 * p_C_); \ + float* weight = reinterpret_cast(malloc(sizeof(float) * 2 * p_C_)); \ memcpy(weight, gamma, sizeof(float) * p_C_); \ memcpy(weight + p_C_, beta, sizeof(float) * p_C_); \ \ diff --git a/src/relay/backend/contrib/dnnl/libs.h b/src/relay/backend/contrib/dnnl/libs.h index beaeaefcd3db..6a9580ae0aa3 100644 --- a/src/relay/backend/contrib/dnnl/libs.h +++ b/src/relay/backend/contrib/dnnl/libs.h @@ -16,9 +16,14 @@ * under the License. */ +#ifndef TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ +#define TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ + #include #include typedef struct { void** data; -} DnnlPackedArgs; \ No newline at end of file +} DnnlPackedArgs; + +#endif // TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index b8fe7351be77..fe79a2bce27a 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -15,6 +15,9 @@ * specific language governing permissions and limitations * under the License. */ +#include +#include + #include #include #include @@ -22,8 +25,6 @@ #include #include -#include -#include #include #include #include @@ -40,14 +41,14 @@ typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); // and make a base claaa such as ExternBuilder for users to implement. 
class GccBuilder : public ExprVisitor { public: - GccBuilder(const std::string& id) { this->subgraph_id_ = id; } + explicit GccBuilder(const std::string& id) { this->subgraph_id_ = id; } void VisitExpr_(const VarNode* node) { subgraph_args_.push_back(node->name_hint()); out_.clear(); out_.push_back({node->name_hint(), 0}); } - + void VisitExpr_(const CallNode* call) final { auto op_node = call->op.as(); std::string func_name = subgraph_id_ + "_" + std::to_string(func_idx++); @@ -212,7 +213,7 @@ class GccModuleNode : public ExternModuleNodeBase { // Reinterpret data and function to the right type and invoke if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { GccPackedArgs packed_args; - packed_args.data = (float**)malloc(sizeof(float*) * args.size()); + packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); for (int i = 0; i < args.size() - 1; ++i) { runtime::NDArray arg = args[i]; packed_args.data[i] = reinterpret_cast(arg->data); @@ -225,7 +226,7 @@ class GccModuleNode : public ExternModuleNodeBase { }); } - void CreateExternSignature (const Function& func, bool update) { + void CreateExternSignature(const Function& func, bool update) { CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc index 361f9eda56c8..721a2324c567 100644 --- a/src/relay/backend/contrib/gcc/libs.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -19,8 +19,8 @@ #include "libs.h" #include +#include #include -#include #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ extern "C" void p_ID_(float* a, float* b, float* out) { \ diff --git a/src/relay/backend/contrib/gcc/libs.h b/src/relay/backend/contrib/gcc/libs.h index 1549cc2f6ef8..261449bda075 100644 --- a/src/relay/backend/contrib/gcc/libs.h +++ b/src/relay/backend/contrib/gcc/libs.h @@ -15,10 +15,14 @@ * specific language governing permissions and limitations * under the License. */ +#ifndef TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ +#define TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ #include #include typedef struct { float** data; -} GccPackedArgs; \ No newline at end of file +} GccPackedArgs; + +#endif // TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index ef0c4dc10e6d..6d943d8415ab 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -886,7 +886,7 @@ void VMCompiler::Compile(Module mod, for (auto gv : context_.global_map) { exec_->global_map.insert({gv.first->name_hint, gv.second}); } - + ExternalFuncCodegen(); } diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 4600202a4be4..c5c022d9d2d7 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -133,7 +133,7 @@ class VMCompiler : public runtime::ModuleNode { /* \brief Use TVM codegen to generat code for primitive functions. */ void PrimitiveFuncCodegen(); - + /* \brief Use TVM codegen to generat code for external functions. 
*/ void ExternalFuncCodegen(); diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index 714a07c26505..f22d8a762345 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -65,8 +65,7 @@ class ExternOpWrapper : public ExprMutator { Expr end = (*end_op)(update_call, compiler_); return end; } - } - else { + } else { LOG(WARNING) << op.operator->()->name << " in " << compiler_ << " is not registered"; } return new_e; From c7c74c303563a7a1daa4b858cd799b3bb4dacc4c Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 9 Oct 2019 13:15:25 -0700 Subject: [PATCH 21/34] remove get lib path API --- include/tvm/relay/contrib_codegen.h | 8 -------- src/relay/backend/contrib/dnnl/codegen.cc | 15 ++------------- src/relay/backend/contrib/gcc/codegen.cc | 18 +++--------------- .../python/relay/test_pass_partition_graph.py | 9 +++++---- 4 files changed, 10 insertions(+), 40 deletions(-) diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index ff91881bcb9d..b1acbd432875 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -47,14 +47,6 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { Close(); } - /*! - * \brief Get the full path of compiled external shared libraries of this compiler. - * - * \return An array of strings of the library paths. - */ - virtual const std::vector GetExternLibPaths( - const std::string& id = "") const = 0; - /*! * \brief Get the function prefix of this compiler. * diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index dba9356e26b9..aadcd82e919e 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -239,12 +239,6 @@ class DnnlBuilder : public ExprVisitor { class DNNLModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths( - const std::string& id = "") const override { - CHECK_GT(src_lib_path_.count(id), 0U); - const auto& pair = src_lib_path_.at(id); - return {pair.second}; - } const std::string GetPrefix() const override { return "dnnl_"; @@ -276,9 +270,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { const std::string& name, const std::shared_ptr& sptr_to_self) override { std::string curr_id = GetSubgraphID(name); - if (!IsOpen()) { - Open(this->GetExternLibPaths(curr_id)); - } + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, @@ -329,9 +321,6 @@ class DNNLModuleNode : public ExternModuleNodeBase { CHECK_GE(std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"), 0); } - // Save the src and lib path. - src_lib_path_.emplace(sid, std::make_pair(src_path_, lib_path_)); - auto builder = DnnlBuilder(GetPrefix() + sid); builder.VisitExpr(func->body); std::string code = builder.build(); @@ -347,6 +336,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { if (ret < 0) { LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; } + Open({lib_path_}); } void Build(const NodeRef& ref) override { @@ -370,7 +360,6 @@ class DNNLModuleNode : public ExternModuleNodeBase { private: std::string src_path_; std::string lib_path_; - std::unordered_map > src_lib_path_; }; /*! 
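With this change the external library is compiled and opened inside `Build()`, so `GetFunction()` only has to look the symbol up. The codegen itself stays reachable through the global registry under the name `relay.ext.dnnl`, the same name the compiler looks up internally. A hedged Python sketch of that lookup, for illustration only (`subgraph_func` is a placeholder for a partitioned function carrying the `External="dnnl"` attribute):

    import tvm

    # Resolve the external codegen entry point registered by the DNNL backend.
    ext_codegen = tvm.get_global_func("relay.ext.dnnl")
    # ext_module = ext_codegen(subgraph_func)  # hypothetical call; would return a
    #                                          # runtime module wrapping the compiled lib
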
diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index fe79a2bce27a..5205d8efc851 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -165,13 +165,6 @@ class GccBuilder : public ExprVisitor { class GccModuleNode : public ExternModuleNodeBase { public: - const std::vector GetExternLibPaths( - const std::string& id = "") const override { - CHECK_GT(src_lib_path_.count(id), 0U); - const auto& pair = src_lib_path_.at(id); - return {pair.second}; - } - const std::string GetPrefix() const override { return "gcc_"; } @@ -195,9 +188,7 @@ class GccModuleNode : public ExternModuleNodeBase { const std::string& name, const std::shared_ptr& sptr_to_self) override { std::string curr_id = GetSubgraphID(name); - if (!IsOpen()) { - Open(this->GetExternLibPaths(curr_id)); - } + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; // Generate an external packed function return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, @@ -248,8 +239,6 @@ class GccModuleNode : public ExternModuleNodeBase { CHECK_GE(std::system(cmd.c_str()), 0); CHECK_GE(std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"), 0); } - // Save the src and lib path. - src_lib_path_.emplace(sid, std::make_pair(src_path_, lib_path_)); auto builder = GccBuilder(GetPrefix() + sid); builder.VisitExpr(func->body); @@ -267,12 +256,12 @@ class GccModuleNode : public ExternModuleNodeBase { if (ret != 0) { LOG(FATAL) << "Failed to compile GCC library. Error code: " << ret; } + Open({lib_path_}); } void Build(const NodeRef& ref) override { if (ref->derived_from()) { CreateExternSignature(Downcast(ref), true); - CompileExternLib(); } else if (ref->derived_from()) { relay::Module mod = Downcast(ref); bool update = true; @@ -280,17 +269,16 @@ class GccModuleNode : public ExternModuleNodeBase { CreateExternSignature(Downcast(it.second), update); update = false; } - CompileExternLib(); } else { LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; } + CompileExternLib(); } private: std::string src_path_; std::string lib_path_; - std::unordered_map > src_lib_path_; }; diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 6751f63e7e18..434ad2ea1730 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -16,6 +16,7 @@ # under the License. """Unit tests for graph partitioning.""" import numpy as np +from nose.tools import nottest import tvm from tvm import relay @@ -172,7 +173,7 @@ def test_multi_node_subgraph(): w_data.append(np.random.rand(10, 10).astype('float32')) for kind in ["debug", "vm"]: - ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(0)) + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(0)) res = ex.evaluate()(x_data, *w_data) tvm.testing.assert_allclose( res.asnumpy(), @@ -220,7 +221,7 @@ def test_extern_gcc(): tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) - +@nottest def test_extern_dnnl(): dtype = 'float32' ishape = (1, 32, 14, 14) @@ -260,7 +261,7 @@ def test_extern_dnnl(): tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) - +@nottest def test_extern_dnnl_mobilenet(): # FIXME: This test is only for demo purpose and supposed to be removed. 
dtype = 'float32' @@ -290,5 +291,5 @@ def test_extern_dnnl_mobilenet(): test_multi_node_subgraph() test_extern_gcc_single_op() test_extern_gcc() - test_extern_dnnl() + #test_extern_dnnl() #test_extern_dnnl_mobilenet() From 5d5c907c88e9f0c228202bd865b14fc2b2bb5d9f Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Thu, 17 Oct 2019 13:09:03 -0700 Subject: [PATCH 22/34] initial commit tutorial --- include/tvm/relay/contrib_codegen.h | 7 -- src/relay/backend/contrib/dnnl/codegen.cc | 2 +- src/relay/backend/contrib/gcc/codegen.cc | 2 +- tutorials/dev/custom_relay_backend.py | 120 ++++++++++++++++++++++ 4 files changed, 122 insertions(+), 9 deletions(-) create mode 100644 tutorials/dev/custom_relay_backend.py diff --git a/include/tvm/relay/contrib_codegen.h b/include/tvm/relay/contrib_codegen.h index b1acbd432875..3ad6b137c439 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/include/tvm/relay/contrib_codegen.h @@ -47,13 +47,6 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { Close(); } - /*! - * \brief Get the function prefix of this compiler. - * - * \return A string of the function name prefix in the library. - */ - virtual const std::string GetPrefix() const = 0; - /*! * \brief Compile the external library. */ diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index aadcd82e919e..5455df685cdb 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -240,7 +240,7 @@ class DnnlBuilder : public ExprVisitor { class DNNLModuleNode : public ExternModuleNodeBase { public: - const std::string GetPrefix() const override { + const std::string GetPrefix() { return "dnnl_"; } diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 5205d8efc851..00246a74c1a8 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -165,7 +165,7 @@ class GccBuilder : public ExprVisitor { class GccModuleNode : public ExternModuleNodeBase { public: - const std::string GetPrefix() const override { + const std::string GetPrefix() { return "gcc_"; } diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py new file mode 100644 index 000000000000..629720df7bc5 --- /dev/null +++ b/tutorials/dev/custom_relay_backend.py @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" + +.. _tutorial-custom-relay-backend + +Design a New Relay Backend for Third-Parties +============================================ +**Author**: `Zhi Chen `_, `Cody Hao Yu `_ + +As the hardware devices targeted by deep learning workloads keep increasing, the required knowledge +for users to achieve high performance on vary devices keeps increasing as well. 
To free data scientists +from worrying about the performance when developing a new model, hardware vendors either provide +libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, or provide frameworks +such as TensorRT to let users describle their models in a certain way to achieve high performance. +However, users have to learn a new programming interface when they attempt to work on a new libaray +or device. As a result, the demeand of a unified programming interface becomes more and more important +to 1) let all users and hardware vendors stand on the same page, and 2) provide a feasbile solution to +allow a specialized hardware or library to only support widely used operators with extremely high +perofrmance, but fallback unsupported operators to general devices like CPU/GPU. + +In this tutorial, we introduce how a hardware vendor can easily implement a Relay backend to support +a specialized hardware device/library. It mainly takes three steps: 1) define whether an operator is +supported, 2) specify how to compile and serialize the supported operators, and 3) specify how to +execute the compiled operators on a certain device. We will demonstrate how to add a new backend that +uses GCC compiler to execute a subgraph of a model. Note that you will need to add the specialized Relay +backend to the TVM codebase and rebuild TVM for enabling. + +""" + +###################################################################### +# Define The Supported Operators +# ------------------------------ +# The first step is to define which operators are supported by our backend. +# We first create a new Python file at python/relay/backend/op/contrib/gcc/extern_op.py, +# and implement a set of boolean functions with corresponding operator names. A boolean +# function should return `True` if we allow it to be executed by our backend; `False` +# otherwise. + +from __future__ import absolute_import + +def conv2d(attrs, args): + """Check if the external codegen should be used. + """ + return False + +def subtract(attrs, args): + """Check if the external codegen should be used. + """ + return True + +def add(attrs, args): + """Check if the external codegen should be used. + """ + return True + +def multiply(attrs, args): + """Check if the external codegen should be used. + """ + return True + +###################################################################### +# Note that since we include `attrs` and `args` into the function signature, we can +# define more complicate rules. For example, we can only support conv2d with float32 +# data type or with kernel size 1x1. + +###################################################################### +# In the last step of the first part, we +# create python/relay/backend/op/contrib/gcc/__init__.py to allow rule functions to +# be used by the TVM. + +from __future__ import absolute_import as _abs +from .extern_op import * + +###################################################################### +# Implement The Codegen +# --------------------- +# The second and the thrid step are implemented in C++ instead of Python. +# Specifically, we create src/relay/backend/contrib/gcc/codegen.cc and +# implement the codegen and runtime dispatcher here. For the codegen, +# we need to implement two functions: `CompileExternalLib()` and `Build()`. +# `Build()` accepts a Relay subgraph and generate the library or device code +# accordingly. In the GCC example, we implement a Relay IR visitor to generate +# C++ code for subgraphs. 
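
######################################################################
# To make the behavior of `Build()` concrete, the C++ emitted for a small
# subgraph such as `(a + b) - c` is semantically equivalent to the NumPy
# function below. This is only an illustrative sketch: the real output is
# plain C++ with one generated helper per call node (named like `gcc_0_0`,
# `gcc_0_1`), not Python.

import numpy as np

def gcc_0_equivalent(a, b, c):
    # gcc_0_0: the first call node (add) becomes one generated helper
    buf_0 = np.add(a, b)
    # gcc_0_1: the second call node (subtract) consumes the intermediate buffer
    return np.subtract(buf_0, c)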
+ +###################################################################### +# In addition `CompileExternalLib()` is used for specifying how to generate and +# serialize an external library for the generated device code (C++ in this +# example). The generated library/executable binary can either be materialized +# to disk and load back during runtime, or stored in memory directly for +# later usage. + +###################################################################### +# Implement The Runtime Dispather +# ------------------------------- +# The last step is invoking the generated external library in runtime. +# Specifically, we need to implement `GetFunction()` in codegen.cc. +# The function takes a subgraph name and returns a `PackedFunc` that +# executes the subgraph with runtime input data. If the subgraph is +# compiled by `Build` in advance and the shared library or executable +# binary is available, then we can invoke it here. +# `GetFunction()` will be invoked by Relay runtime, including interpreter, +# graph runtime, and VM, meaning that this one implemtation works for all +# kinds of Relay runtimes. + + From 0dcc5d0d196e7b2749a9c962120736caebe5c563 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Fri, 18 Oct 2019 11:14:31 -0700 Subject: [PATCH 23/34] add annotation to tutorial --- tutorials/dev/custom_relay_backend.py | 157 +++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 17 deletions(-) diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index 629720df7bc5..8ab0d75a8dda 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -23,22 +23,23 @@ **Author**: `Zhi Chen `_, `Cody Hao Yu `_ As the hardware devices targeted by deep learning workloads keep increasing, the required knowledge -for users to achieve high performance on vary devices keeps increasing as well. To free data scientists -from worrying about the performance when developing a new model, hardware vendors either provide -libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, or provide frameworks -such as TensorRT to let users describle their models in a certain way to achieve high performance. -However, users have to learn a new programming interface when they attempt to work on a new libaray -or device. As a result, the demeand of a unified programming interface becomes more and more important -to 1) let all users and hardware vendors stand on the same page, and 2) provide a feasbile solution to -allow a specialized hardware or library to only support widely used operators with extremely high -perofrmance, but fallback unsupported operators to general devices like CPU/GPU. +for users to achieve high performance on vary devices keeps increasing as well. To free data +scientists from worrying about the performance when developing a new model, hardware vendors either +provide libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, +or provide frameworks such as TensorRT to let users describle their models in a certain way to +achieve high performance. However, users have to learn a new programming interface when they +attempt to work on a new libaray or device. 
As a result, the demeand of a unified programming +interface becomes more and more important to 1) let all users and hardware vendors stand on the +same page, and 2) provide a feasbile solution to allow a specialized hardware or library to only +support widely used operators with extremely high perofrmance, but fallback unsupported operators +to general devices like CPU/GPU. In this tutorial, we introduce how a hardware vendor can easily implement a Relay backend to support a specialized hardware device/library. It mainly takes three steps: 1) define whether an operator is supported, 2) specify how to compile and serialize the supported operators, and 3) specify how to -execute the compiled operators on a certain device. We will demonstrate how to add a new backend that -uses GCC compiler to execute a subgraph of a model. Note that you will need to add the specialized Relay -backend to the TVM codebase and rebuild TVM for enabling. +execute the compiled operators on a certain device. We will demonstrate how to add a new backend +that uses GCC compiler to execute a subgraph of a model. Note that you will need to add the +specialized Relay backend to the TVM codebase and rebuild TVM for enabling. """ @@ -79,12 +80,106 @@ def multiply(attrs, args): # data type or with kernel size 1x1. ###################################################################### -# In the last step of the first part, we -# create python/relay/backend/op/contrib/gcc/__init__.py to allow rule functions to -# be used by the TVM. +# Customize Subgraph Annotations +# ------------------------------ +# In addition to specifying a set of rules for supported operators, we can also implement +# a Relay IR mutator to find the supported subgraphs, which may include multiple operators, +# for the target backend. Here we implement an annotator that includes an entire Relay graph +# to be offloaded. Specifically, we are going +# to do two tasks: 1) insert `aubgraph_begin` after all input variables, and 2) insert +# `subgraph_end` before the primary output. For example, given a Relay graph as follows: +# input_a +# | +# add --- input_b +# | +# subtract --- input_c +# | +# multiply --- input_d +# | +# out +# +# Our goal is to mutate the graph to the following: +# +# input_a +# | +# subgraph_begin +# | +# add --- subgraph_begin --- input_b +# | +# subtract --- subgraph_begin --- input_c +# | +# multiply --- subgraph_begin --- input_d +# | +# subgraph_end +# | +# out +# +# The implementation is shown as follows. As can be seen, the annotator is derived from +# `ExprMutator` that traverses a Relay graph and allows we to mutate it. We know that all ops +# are `call` nodes in Relay graph, so we override the call node mutator `visit_call` in +# `ExprMutator` and insert annotations. + +import tvm +from tvm import relay +from tvm.relay.expr_functor import ExprMutator +from tvm.relay.annotation import subgraph_begin, subgraph_end + +class WholeGraphAnnotator(ExprMutator): + """ + An annotator that creates a subgraph for an entire graph. 
+ """ + def __init__(self, compiler): + super(WholeGraphAnnotator, self).__init__() + self.compiler = compiler + self.last_call = True + + def visit_call(self, call): + curr_last = self.last_call + self.last_call = False + + params = [] + for arg in call.args: + param = super().visit(arg) + if isinstance(param, relay.expr.Var): + param = subgraph_begin(param, self.compiler) + params.append(param) + + new_call = relay.Call(call.op, params, call.attrs) + if curr_last: + new_call = subgraph_end(new_call, self.compiler) + return new_call + +###################################################################### +# Finally, we apply the annotator to our workload. Let's first build a Relay function: + +input_a = relay.var('a', shape=(10, 10)) +input_b = relay.var('b', shape=(10, 10)) +input_c = relay.var('c', shape=(10, 10)) +input_d = relay.var('d', shape=(10, 10)) + +temp_1 = relay.add(input_a, input_b) +temp_2 = relay.subtract(temp_1, input_c) +out = relay.multiply(temp_2, input_d) +func = relay.Function([input_a, input_b, input_c, input_d], out) + +###################################################################### +# The above Relay function results in the following IR: -from __future__ import absolute_import as _abs -from .extern_op import * +print(func) + +###################################################################### +# Then we apply the annotator to the IR and partition the graph: + +mod = relay.Module() +mod['main'] = WholeGraphAnnotator('gcc').visit(func) +mod = relay.transform.PartitionGraph()(mod) + +###################################################################### +# Accordingly, the IR is transformed to the following. We can see that the entire Relay graph +# is enclosed to a function with `External="gcc"` attribute. This indicates that this function +# will be offloaded to an external backend during the runtime. + +print(mod['main']) ###################################################################### # Implement The Codegen @@ -117,4 +212,32 @@ def multiply(attrs, args): # graph runtime, and VM, meaning that this one implemtation works for all # kinds of Relay runtimes. +###################################################################### +# Add Codegen to TVM Building Process +# ----------------------------------- +# Finally, we include the implemented codegen to the cmake config so that +# it will be built along with the TVM. To do so, we add two lines to +# cmake/modules/contrib/Extern.cmake: +# file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) +# list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) + + +###################################################################### +# We can now test the correctness of the external GCC backend: +# +# .. note:: +# The complete GCC backend implementation is in the TVM codebase +# so we can directly use it in this tutorial for demonstration. 
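
######################################################################
# The check below drives the partitioned module with the debug interpreter.
# Because the dispatcher is exposed as an ordinary packed function, the same
# module should, by the reasoning above, also run under the other Relay
# runtimes. A hedged sketch using the VM executor instead:
#
#     ex = relay.create_executor('vm', mod=mod, ctx=tvm.cpu(0))
#     result = ex.evaluate()(a_data, b_data, c_data, d_data)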
+ +import numpy as np + +a_data = np.random.rand(10, 10).astype('float32') +b_data = np.random.rand(10, 10).astype('float32') +c_data = np.random.rand(10, 10).astype('float32') +d_data = np.random.rand(10, 10).astype('float32') + +ex = relay.create_executor('debug', mod=mod, ctx=tvm.cpu(0)) +result = ex.evaluate()(a_data, b_data, c_data, d_data) +tvm.testing.assert_allclose(result.asnumpy(), (a_data + b_data - c_data) * d_data) +print('Results are correct!') \ No newline at end of file From 70d1e33d6442d28893a4de425436045fac53e10f Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 28 Oct 2019 20:55:30 +0000 Subject: [PATCH 24/34] Refine tutorial a bit --- tutorials/dev/custom_relay_backend.py | 91 +++++++++++++++++---------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index 8ab0d75a8dda..7359fea1bd7b 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -18,27 +18,31 @@ .. _tutorial-custom-relay-backend -Design a New Relay Backend for Third-Parties +Bring Your Own Codegen To TVM ============================================ **Author**: `Zhi Chen `_, `Cody Hao Yu `_ As the hardware devices targeted by deep learning workloads keep increasing, the required knowledge -for users to achieve high performance on vary devices keeps increasing as well. To free data +for users to achieve high performance on various devices keeps increasing as well. To free data scientists from worrying about the performance when developing a new model, hardware vendors either provide libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, or provide frameworks such as TensorRT to let users describle their models in a certain way to achieve high performance. However, users have to learn a new programming interface when they -attempt to work on a new libaray or device. As a result, the demeand of a unified programming +attempt to work on a new libaray or device. As a result, the demand of a unified programming interface becomes more and more important to 1) let all users and hardware vendors stand on the -same page, and 2) provide a feasbile solution to allow a specialized hardware or library to only +same page, and 2) provide a feasible solution to allow a specialized hardware or library to only support widely used operators with extremely high perofrmance, but fallback unsupported operators to general devices like CPU/GPU. -In this tutorial, we introduce how a hardware vendor can easily implement a Relay backend to support -a specialized hardware device/library. It mainly takes three steps: 1) define whether an operator is -supported, 2) specify how to compile and serialize the supported operators, and 3) specify how to -execute the compiled operators on a certain device. We will demonstrate how to add a new backend -that uses GCC compiler to execute a subgraph of a model. Note that you will need to add the +In this tutorial, we demonstrate how a hardware vendor can easily implement +a Relay backend to support a specialized hardware device/library. It mainly +takes three steps: 1) define whether an operator is supported under a given +template, 2) specify how to compile and serialize the supported operators so +that it can ingest TVM specific data format, e.g. NDArray, and 3) specify how +to execute the compiled operators on a certain device. We will demonstrate how +to add a new backend that uses open source compilers (e.g. 
GCC, LLVM, etc) or any +proprietary compilers to execute a subgraph of a model without the exposure of +the IP of customer's codegen tool chain. Note that you will need to add the specialized Relay backend to the TVM codebase and rebuild TVM for enabling. """ @@ -46,10 +50,13 @@ ###################################################################### # Define The Supported Operators # ------------------------------ -# The first step is to define which operators are supported by our backend. -# We first create a new Python file at python/relay/backend/op/contrib/gcc/extern_op.py, +# The first step is to define which operators are supported by your backend. +# A templated is provided to ease vendor's effort to add the supported +# operators. +# +# For example, We create a new Python file at python/relay/backend/op/contrib/gcc/extern_op.py, # and implement a set of boolean functions with corresponding operator names. A boolean -# function should return `True` if we allow it to be executed by our backend; `False` +# function should return `True` if we allow it to be executed by the given backend; `False` # otherwise. from __future__ import absolute_import @@ -75,9 +82,21 @@ def multiply(attrs, args): return True ###################################################################### -# Note that since we include `attrs` and `args` into the function signature, we can -# define more complicate rules. For example, we can only support conv2d with float32 -# data type or with kernel size 1x1. +# Note that since we include `attrs` and `args` into the function signature, we +# can define more complicated rules. For example, we can only support conv2d +# with float32 data type or with kernel size 1x1. In addition, the vendors can +# also check the attributes associated with a given operator to decide if it is +# supported by checking the fields in `attrs`. In a even more complicated but +# interesting scenario, we also allow developers to check the sequence of +# operators through iterating on the `agrs`. However, this is only +# unidirectional as only the inputs are visible. +# +# After annotating whether an operator can be executed on the given backend. +# Users can directly invoke the partitioning pass to separate the graph into +# multiple segments. The C++ backend implements a partitioning pass to fullfil +# the task and creates subgraphs/sub-functions with *External* attribute, +# indicating that this function will be handled by external codegen tool. +# Therefore, Relay passes should skip optimizations on them. ###################################################################### # Customize Subgraph Annotations @@ -85,9 +104,9 @@ def multiply(attrs, args): # In addition to specifying a set of rules for supported operators, we can also implement # a Relay IR mutator to find the supported subgraphs, which may include multiple operators, # for the target backend. Here we implement an annotator that includes an entire Relay graph -# to be offloaded. Specifically, we are going -# to do two tasks: 1) insert `aubgraph_begin` after all input variables, and 2) insert -# `subgraph_end` before the primary output. For example, given a Relay graph as follows: +# to be offloaded. Specifically, we are going to do two tasks: +# - insert `aubgraph_begin` after all input variables +# - insert `subgraph_end` before the primary output. 
For example, given a Relay graph as follows: # input_a # | # add --- input_b @@ -150,7 +169,8 @@ def visit_call(self, call): return new_call ###################################################################### -# Finally, we apply the annotator to our workload. Let's first build a Relay function: +# Finally, we apply the annotator to our workload. Let's first build a Relay +# function: input_a = relay.var('a', shape=(10, 10)) input_b = relay.var('b', shape=(10, 10)) @@ -175,21 +195,22 @@ def visit_call(self, call): mod = relay.transform.PartitionGraph()(mod) ###################################################################### -# Accordingly, the IR is transformed to the following. We can see that the entire Relay graph -# is enclosed to a function with `External="gcc"` attribute. This indicates that this function -# will be offloaded to an external backend during the runtime. +# Accordingly, the IR is transformed to the following. We can see that the +# entire Relay graph is enclosed in a function with `External="gcc"` attribute. +# It indicates that this function will be offloaded to an external backend +# during the runtime. print(mod['main']) ###################################################################### # Implement The Codegen # --------------------- -# The second and the thrid step are implemented in C++ instead of Python. +# The second and the third step are implemented in C++ instead of Python. # Specifically, we create src/relay/backend/contrib/gcc/codegen.cc and # implement the codegen and runtime dispatcher here. For the codegen, # we need to implement two functions: `CompileExternalLib()` and `Build()`. -# `Build()` accepts a Relay subgraph and generate the library or device code -# accordingly. In the GCC example, we implement a Relay IR visitor to generate +# `Build()` accepts a Relay module or subgraph and generate the library or device +# code accordingly. In the GCC example, we implement a Relay IR visitor to generate # C++ code for subgraphs. ###################################################################### @@ -197,17 +218,23 @@ def visit_call(self, call): # serialize an external library for the generated device code (C++ in this # example). The generated library/executable binary can either be materialized # to disk and load back during runtime, or stored in memory directly for -# later usage. +# later usage using whatever user defined mechanism. In the GCC case, the +# stand system calls e.g. dlopen/dlsym or LoadLibrary/GetProcAddress are used +# for Linux and Windows, respectively. ###################################################################### # Implement The Runtime Dispather # ------------------------------- # The last step is invoking the generated external library in runtime. -# Specifically, we need to implement `GetFunction()` in codegen.cc. -# The function takes a subgraph name and returns a `PackedFunc` that -# executes the subgraph with runtime input data. If the subgraph is -# compiled by `Build` in advance and the shared library or executable -# binary is available, then we can invoke it here. +# Specifically, we need to implement tvm runtime `Module` compatible +# `GetFunction()` in codegen.cc. The function takes a subgraph name and returns +# a `PackedFunc` that executes the subgraph with runtime input data. Note that +# the runtime data in TVM is provided in the tvm `NDArray` format. It's +# vendors' repsonsiblity to deserialize it into the format that they library +# can ingest. 
For example, we unpack it and extract the raw pointers for +# MKL-DNN. If the subgraph is compiled by `Build` in advance and the shared +# library or executable binary is available, then we can invoke it here. +# # `GetFunction()` will be invoked by Relay runtime, including interpreter, # graph runtime, and VM, meaning that this one implemtation works for all # kinds of Relay runtimes. @@ -240,4 +267,4 @@ def visit_call(self, call): result = ex.evaluate()(a_data, b_data, c_data, d_data) tvm.testing.assert_allclose(result.asnumpy(), (a_data + b_data - c_data) * d_data) -print('Results are correct!') \ No newline at end of file +print('Results are correct!') From 125b28fb069a12b97fb9d4714a65f3911d8697ee Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Tue, 29 Oct 2019 20:14:15 +0000 Subject: [PATCH 25/34] rebase to upstream --- include/tvm/runtime/vm.h | 25 +++++++++++++---------- src/relay/backend/contrib/dnnl/codegen.cc | 5 ++--- src/relay/backend/contrib/gcc/codegen.cc | 4 ++-- src/relay/backend/vm/compiler.cc | 8 ++++---- src/runtime/vm/vm.cc | 21 ++++++++----------- 5 files changed, 30 insertions(+), 33 deletions(-) diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index bcdbbbb6cf16..2824c2dc17ed 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -611,9 +611,13 @@ class Executable : public ModuleNode { return "VMExecutable"; } - /*! \brief The runtime module/library that contains both the host and also the device - * code when executing on non-CPU devices. */ + /*! + * \brief The runtime module/library that contains both the host and also the device + * code when executing on non-CPU devices. + */ runtime::Module lib; + /*! \brief The external module/library. */ + std::vector ext_libs; /*! \brief The global constant pool. */ std::vector constants; /*! \brief A map from globals (as strings) to their index in the function map. */ @@ -624,6 +628,13 @@ class Executable : public ModuleNode { std::unordered_map primitive_map; /*! \brief The virtual machine's function table. */ std::vector functions; + /*! \brief A mapping from the subgraph id to the external library index in the + * `ext_libs`. + */ + std::unordered_map external_map; + /*! \brief A mapping from the subgraph id to the external function name. + */ + std::unordered_map external_func_map; private: /*! @@ -737,9 +748,7 @@ class VirtualMachine : public runtime::ModuleNode { /*! \brief The virtual machine's packed function table. */ std::vector packed_funcs_; /*! \brief The virtual machine's external function table. */ - std::vector external_funcs; - /*! \brief The external module/library. */ - std::vector ext_libs; + std::vector external_funcs; /*! \brief The current stack of call frames. */ std::vector frames_; /*! \brief The fuction table index of the current function. */ @@ -833,12 +842,6 @@ class VirtualMachine : public runtime::ModuleNode { /*! \brief Get device context for params. */ TVMContext GetParamsContext() const; - std::unordered_map external_map; - - /*! \brief A mapping from the subgraph id to the external function name. - */ - std::unordered_map external_func_map; - private: /*! * \brief Invoke a global setting up the VM state to execute. 
diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 5455df685cdb..ea537277e154 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -239,7 +239,6 @@ class DnnlBuilder : public ExprVisitor { class DNNLModuleNode : public ExternModuleNodeBase { public: - const std::string GetPrefix() { return "dnnl_"; } @@ -340,10 +339,10 @@ class DNNLModuleNode : public ExternModuleNodeBase { } void Build(const NodeRef& ref) override { - if (ref->derived_from()) { + if (ref->IsInstance()) { CreateExternSignature(Downcast(ref), true); CompileExternLib(); - } else if (ref->derived_from()) { + } else if (ref->IsInstance()) { relay::Module mod = Downcast(ref); bool update = true; for (const auto& it : mod->functions) { diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 00246a74c1a8..cceffd012ec9 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -260,9 +260,9 @@ class GccModuleNode : public ExternModuleNodeBase { } void Build(const NodeRef& ref) override { - if (ref->derived_from()) { + if (ref->IsInstance()) { CreateExternSignature(Downcast(ref), true); - } else if (ref->derived_from()) { + } else if (ref->IsInstance()) { relay::Module mod = Downcast(ref); bool update = true; for (const auto& it : mod->functions) { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 6d943d8415ab..066316e0202c 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1039,13 +1039,13 @@ void VMCompiler::ExternalFuncCodegen() { const auto *cg = runtime::Registry::Get("relay.ext." + it.first); CHECK(cg) << "relay.ext." << it.first << " is not registered"; runtime::Module mod = (*cg)(it.second); - comp_map.emplace(it.first, vm_->ext_libs.size()); - vm_->ext_libs.push_back(mod); + comp_map.emplace(it.first, exec_->ext_libs.size()); + exec_->ext_libs.push_back(mod); } for (size_t i = 0; i < context_.external_funcs.size(); i++) { - vm_->external_func_map.emplace(i, std::get<0>(func_codgen[i])); - vm_->external_map.emplace(i, comp_map[std::get<1>(func_codgen[i])]); + exec_->external_func_map.emplace(i, std::get<0>(func_codgen[i])); + exec_->external_map.emplace(i, comp_map[std::get<1>(func_codgen[i])]); } } diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 48c5ddad992e..4114efdeff3a 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -843,25 +843,20 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { packed_funcs_[packed_index] = lib.GetFunction(packed_name); } - for (const auto& it : external_map) { + for (const auto& it : this->exec->external_map) { Index subgraph_id = it.first; - Index ext_lib_indx = it.second; + Index ext_lib_idx = it.second; if (external_funcs.size() <= static_cast(subgraph_id)) { external_funcs.resize(subgraph_id + 1); } - CHECK_GT(external_func_map.count(subgraph_id), 0U); - external_funcs[subgraph_id] = - ext_libs[ext_lib_indx].GetFunction(external_func_map[subgraph_id]); + CHECK_GT(this->exec->external_func_map.count(subgraph_id), 0U); + const std::string& symb = exec->external_func_map.at(subgraph_id); + auto ext_mod = exec->ext_libs.at(ext_lib_idx); + CHECK(ext_mod.operator->()) << "external module is not defined." << "\n"; + external_funcs[subgraph_id] = ext_mod.GetFunction(symb); } } -// TODO(@zhiics) Invoke the external function/subgraph. 
-void VirtualMachine::InvokeExternal(Index ext_index, - const relay::Function& func, - Index arg_count, Index output_size, - const std::vector& args) { -} - void VirtualMachine::Init(const std::vector& ctxs) { ctxs_ = ctxs; } @@ -969,7 +964,7 @@ void VirtualMachine::RunLoop() { case Opcode::InvokeExternal: { const auto& func = external_funcs[instr.ext_index]; const auto& arity = instr.ext_arity; - std::vector args; + std::vector args; for (Index i = 0; i < arity; ++i) { args.push_back(ReadRegister(instr.ext_args[i])); } From a298f9c2add2dae0480888309d9f60f5480d1717 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Wed, 30 Oct 2019 06:13:05 +0000 Subject: [PATCH 26/34] Improve: - Separate compilation and runtime - Create separate build pipeline for external and normal functions - Serialize InvokeExternal Instruction --- cmake/modules/contrib/Extern.cmake | 5 + include/tvm/runtime/vm.h | 2 +- python/tvm/relay/__init__.py | 2 +- python/tvm/relay/build_module.py | 25 ++++ src/relay/backend/compile_engine.cc | 7 +- src/relay/backend/contrib/contrib_codegen.h | 71 ++++++++++ src/relay/backend/contrib/dnnl/codegen.cc | 122 +++++------------- src/relay/backend/contrib/dnnl/libs.cc | 9 +- src/relay/backend/contrib/dnnl/libs.h | 29 ----- src/relay/backend/contrib/gcc/codegen.cc | 103 ++++----------- src/relay/backend/contrib/gcc/libs.cc | 9 +- src/relay/backend/contrib/gcc/libs.h | 28 ---- src/runtime/contrib/dnnl/dnnl.cc | 78 +++++++++++ src/runtime/contrib/dnnl/dnnl.h | 64 +++++++++ .../runtime/contrib/extern_common.h | 118 +++++++---------- src/runtime/contrib/gcc/gcc.cc | 78 +++++++++++ src/runtime/contrib/gcc/gcc.h | 63 +++++++++ src/runtime/vm/executable.cc | 20 ++- src/runtime/vm/vm.cc | 1 + .../python/relay/test_pass_partition_graph.py | 11 +- 20 files changed, 531 insertions(+), 314 deletions(-) create mode 100644 src/relay/backend/contrib/contrib_codegen.h delete mode 100644 src/relay/backend/contrib/dnnl/libs.h delete mode 100644 src/relay/backend/contrib/gcc/libs.h create mode 100644 src/runtime/contrib/dnnl/dnnl.cc create mode 100644 src/runtime/contrib/dnnl/dnnl.h rename include/tvm/relay/contrib_codegen.h => src/runtime/contrib/extern_common.h (62%) create mode 100644 src/runtime/contrib/gcc/gcc.cc create mode 100644 src/runtime/contrib/gcc/gcc.h diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 20e0cb6fa100..8cce4365ef53 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -21,7 +21,12 @@ message(STATUS "Build with relay.backend.contrib") file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) +file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) +list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) + # DNNL (for demo purpose) file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) +file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*.cc) +list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 2824c2dc17ed..8bf2c3553471 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -301,7 +301,7 @@ struct Instruction { */ static Instruction InvokePacked(Index packed_index, Index arity, Index output_size, const std::vector& args); - /*! + /*! * \brief Construct an allocate tensor instruction with constant shape. * \param storage The storage to allocate out of. * \param shape The shape of the tensor. 
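The Python entry point introduced just below is `relay.build_extern`, which simply chains the `ExternOp` annotation pass with `PartitionGraph`. A hedged usage sketch, assuming `func` is an ordinary Relay function and "gcc" names a registered external codegen:

    import tvm
    from tvm import relay

    # Annotate supported operators for the gcc codegen and partition the graph;
    # build_extern accepts either a Relay function or a module.
    mod = relay.build_extern(func, "gcc")
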
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index c7cbcf096a6c..60057e3387b4 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -29,7 +29,7 @@ from . import adt from . import analysis from . import transform -from .build_module import build, create_executor, optimize +from .build_module import build, create_executor, optimize, build_extern from .transform import build_config from . import prelude from . import parser diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 28ce16b9b452..dcc7103a775e 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -30,6 +30,7 @@ from .module import Module as _Module from .backend import interpreter as _interpreter from .backend.vm import VMExecutor +from . import transform as _transform def _update_target(target): target = target if target else _target.current_target() @@ -245,14 +246,21 @@ def build(mod, target=None, target_host=None, params=None): return graph_json, mod, params +<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d def optimize(mod, target=None, params=None): """Helper function that optimizes a Relay module. +======= +def build_extern(mod, target): + """Helper function that builds a Relay function to run on external codegen + tools. +>>>>>>> Improve: Parameters ---------- mod : relay.Module The module to build. Using relay.Function is deprecated. +<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to @@ -261,10 +269,15 @@ def optimize(mod, target=None, params=None): params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. +======= + target : str + The name of the external compilation target. +>>>>>>> Improve: Returns ------- mod : relay.Module +<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d The optimized relay module. params : dict @@ -294,6 +307,18 @@ def optimize(mod, target=None, params=None): bld_mod = BuildModule() mod, params = bld_mod.optimize(func, target, params) return mod, params +======= + The relay module contains partitioned subgraphes for external codegen + tools. + """ + if isinstance(mod, _expr.Function): + mod = _Module.from_expr(mod) + + seq = _transform.Sequential([_transform.ExternOp(target), + _transform.PartitionGraph()]) + mod = seq(mod) + return mod +>>>>>>> Improve: class GraphExecutor(_interpreter.Executor): diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index c35b49fba8d4..4425a0ec9969 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -21,6 +21,8 @@ * \file relay/backend/compile_engine.cc * \brief Internal compialtion engine. 
*/ +#include "compile_engine.h" + #include #include #include @@ -28,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -38,8 +39,9 @@ #include #include #include + +#include "contrib/contrib_codegen.h" #include "../ir/type_functor.h" -#include "compile_engine.h" namespace tvm { namespace relay { @@ -601,6 +603,7 @@ class CompileEngineImpl : public CompileEngineNode { auto name = FunctionGetAttr(key->source_func, "func_name"); const tvm::ir::StringImm* func_name = name.as(); CHECK(func_name); + value->lib.GetFunction("init")(); value->packed_func = value->lib.GetFunction(func_name->value); } else if (const auto* f = runtime::Registry::Get("relay.backend.build")) { // build the function. diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h new file mode 100644 index 000000000000..b3c7894ddd02 --- /dev/null +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/contrib_codegen.h + * \brief The base class for external codegen tools. + */ +#ifndef TVM_RELAY_BACKEND_CONTRIB_CONTRIB_CODEGEN_H_ +#define TVM_RELAY_BACKEND_CONTRIB_CONTRIB_CODEGEN_H_ + +#include +#include +#include "../../../runtime/contrib/extern_common.h" + +namespace tvm { +namespace relay { +namespace contrib { + +class ExternCodegenBase { + public: + ExternCodegenBase() = default; + + /*! + * \brief Compile the external library. + */ + virtual void CompileExternLib() = 0; + + /*! + * \brief Build the shared library of external ops. + * + * \param ref The subgraph Relay expression/module to be executed using extern ops. + * + */ + virtual void Build(const NodeRef& ref) = 0; + + /*! + * \brief Split the Relay function name to tokens. + * + * \param func The provided function. + * + * \return A vector of tokenized function name splitted by "_". + */ + std::string GetSubgraphID(const Function& func) const { + const auto name_node = + FunctionGetAttr(func, "func_name").as(); + CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; + std::string name = name_node->value; + return runtime::contrib::GetSubgraphID(name); + } +}; + +} // namespace contrib +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_BACKEND_CONTRIB_CONTRIB_CODEGEN_H_ diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index ea537277e154..9eb855280dcb 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -1,4 +1,5 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one +/* + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file @@ -15,35 +16,30 @@ * specific language governing permissions and limitations * under the License. */ -#include -#include + +/*! + * \file src/relay/backend/contrib/dnnl/codegen.cc + * \brief Implementation of DNNL codegen APIs. + */ + #include -#include #include #include #include #include -#include -#include #include +#include #include -#include +#include -#if defined(_WIN32) -#include -#else -#include -#endif - -#include "libs.h" +#include "../../../../runtime/contrib/dnnl/dnnl.h" +#include "../contrib_codegen.h" namespace tvm { namespace relay { namespace contrib { -typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); - // FIXME: This is an experimental implementation. We should implement all utilities // and make a base class such as ExternBuilder for users to implement. class DnnlBuilder : public ExprVisitor { @@ -184,7 +180,7 @@ class DnnlBuilder : public ExprVisitor { } // Write subgraph function declaration - code += "extern \\\"C\\\" void " + subgraph_id_ + "(DnnlPackedArgs args, float* out) {\n"; + code += "extern \"C\" void " + subgraph_id_ + "(DnnlPackedArgs args, float* out) {\n"; // Unpack inputs for (size_t i = 0; i < subgraph_args_.size(); ++i) { @@ -237,65 +233,10 @@ class DnnlBuilder : public ExprVisitor { } }; -class DNNLModuleNode : public ExternModuleNodeBase { +class DNNLCodegen : public ExternCodegenBase { public: - const std::string GetPrefix() { - return "dnnl_"; - } - - /*! - * \brief Get the source code of the external module. - * - * \param format The format of the source code. - * - * \return The source code of the external library module in the text form. - */ - TVM_DLL std::string GetSource(const std::string& format = "") override { - return ""; - } - - const char* type_key() const override { return "DNNLModule"; } - - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. - * - * \param name the name of the external function. - * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. 
- */ - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override { - std::string curr_id = GetSubgraphID(name); - - CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - - return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, - tvm::TVMRetValue* rv) { - const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); - runtime::NDArray out_arg = args[args.size() - 1]; - auto out = reinterpret_cast(out_arg->data); - - // Get function from the library - std::string encoded_name = GetPrefix() + curr_id; - auto func_s = reinterpret_cast(GetSymbol(encoded_name)); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - DnnlPackedArgs packed_args; - packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); - for (int i = 0; i < args.size() - 1; ++i) { - runtime::NDArray arg = args[i]; - packed_args.data[i] = reinterpret_cast(arg->data); - } - (*func_s)(packed_args, out); - } else { - LOG(FATAL) << "Only support float32 type."; - } - *rv = out; - }); + std::string GetLibPath() const { + return lib_path_; } void CreateExternSignature(const Function& func, bool update) { @@ -313,29 +254,27 @@ class DNNLModuleNode : public ExternModuleNodeBase { std::uniform_int_distribution distr; std::stringstream ss; ss << std::hex << distr(gen); - src_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".cc"; + std::ifstream lib_file("src/relay/backend/contrib/dnnl/libs.cc"); + code_.assign((std::istreambuf_iterator(lib_file)), + std::istreambuf_iterator()); lib_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".so"; - std::string cmd = "cp src/relay/backend/contrib/dnnl/libs.cc " + src_path_; - CHECK_GE(std::system(cmd.c_str()), 0); - CHECK_GE(std::system("cp src/relay/backend/contrib/dnnl/libs.h /tmp/"), 0); } - auto builder = DnnlBuilder(GetPrefix() + sid); + auto builder = DnnlBuilder(runtime::contrib::kDnnlPrefix + sid); builder.VisitExpr(func->body); std::string code = builder.build(); - - std::string cmd = "echo \"" + code + "\" >> " + src_path_; - CHECK_GE(std::system(cmd.c_str()), 0); + code_ = code_ + code; } void CompileExternLib() override { - std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC " + src_path_ + - " -o " + lib_path_ + " -ldl -lpthread -lm -ldnnl"; + std::string code = "echo \'" + code_ + "\'"; + std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC -xc++ - -o " + lib_path_ + + " -ldl -lpthread -lm -ldnnl"; + cmd = code + " | " + cmd; int ret = std::system(cmd.c_str()); if (ret < 0) { LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; } - Open({lib_path_}); } void Build(const NodeRef& ref) override { @@ -357,7 +296,7 @@ class DNNLModuleNode : public ExternModuleNodeBase { } private: - std::string src_path_; + std::string code_; std::string lib_path_; }; @@ -366,12 +305,15 @@ class DNNLModuleNode : public ExternModuleNodeBase { * compile it into a runtime module. 
*/ runtime::Module DNNLCompiler(const NodeRef& ref) { - std::shared_ptr n = std::make_shared(); - n->Build(ref); + DNNLCodegen dnnl; + dnnl.Build(ref); + std::shared_ptr n = + std::make_shared(dnnl.GetLibPath()); return runtime::Module(n); } -TVM_REGISTER_API("relay.ext.dnnl").set_body_typed(DNNLCompiler); +TVM_REGISTER_API("relay.ext.dnnl") +.set_body_typed(DNNLCompiler); } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index 9ab69593ef80..dfbdb65d16d0 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -1,4 +1,5 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one +/* + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file @@ -16,8 +17,6 @@ * under the License. */ -#include "libs.h" - #include #include #include @@ -32,6 +31,10 @@ using namespace dnnl; +typedef struct { + void** data; +} DnnlPackedArgs; + // Read from memory, write to handle inline void read_from_dnnl_memory(void* handle, const memory& mem) { size_t bytes = mem.get_desc().get_size(); diff --git a/src/relay/backend/contrib/dnnl/libs.h b/src/relay/backend/contrib/dnnl/libs.h deleted file mode 100644 index 6a9580ae0aa3..000000000000 --- a/src/relay/backend/contrib/dnnl/libs.h +++ /dev/null @@ -1,29 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ -#define TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ - -#include -#include - -typedef struct { - void** data; -} DnnlPackedArgs; - -#endif // TVM_RELAY_BACKEND_CONTRIB_DNNL_LIBS_H_ diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index cceffd012ec9..7317b7017a6d 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -1,4 +1,5 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one +/* + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file @@ -15,28 +16,23 @@ * specific language governing permissions and limitations * under the License. 
*/ -#include -#include - -#include #include #include #include #include -#include #include +#include #include -#include +#include -#include "libs.h" +#include "../contrib_codegen.h" +#include "../../../../runtime/contrib/gcc/gcc.h" namespace tvm { namespace relay { namespace contrib { -typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); - // FIXME: This is an experimental implementation. We should implement all utilities // and make a base claaa such as ExternBuilder for users to implement. class GccBuilder : public ExprVisitor { @@ -118,7 +114,7 @@ class GccBuilder : public ExprVisitor { } // Write subgraph function declaration - code += "extern \\\"C\\\" void " + subgraph_id_ + "(GccPackedArgs args, float* out) {\n"; + code += "extern \"C\" void " + subgraph_id_ + "(GccPackedArgs args, float* out) {\n"; // Unpack inputs for (size_t i = 0; i < subgraph_args_.size(); ++i) { @@ -163,58 +159,10 @@ class GccBuilder : public ExprVisitor { } }; -class GccModuleNode : public ExternModuleNodeBase { +class GccCodegen : public ExternCodegenBase { public: - const std::string GetPrefix() { - return "gcc_"; - } - - /*! - * \brief Get the source code of the external module. - * - * \param format The format of the source code. - * - * \return The source code of the external library module in the text form. - */ - TVM_DLL std::string GetSource(const std::string& format = "") override { - return ""; - } - - const char* type_key() const override { - return "GccModule"; - } - - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override { - std::string curr_id = GetSubgraphID(name); - - CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - // Generate an external packed function - return PackedFunc([sptr_to_self, curr_id, this](tvm::TVMArgs args, - tvm::TVMRetValue* rv) { - const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); - runtime::NDArray out_arg = args[args.size() - 1]; - auto out = reinterpret_cast(out_arg->data); - - // Get function from the library - std::string encoded_name = GetPrefix() + curr_id; - auto func_s = reinterpret_cast(GetSymbol(encoded_name)); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - GccPackedArgs packed_args; - packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); - for (int i = 0; i < args.size() - 1; ++i) { - runtime::NDArray arg = args[i]; - packed_args.data[i] = reinterpret_cast(arg->data); - } - (*func_s)(packed_args, out); - } else { - LOG(FATAL) << "Only support float32 type."; - } - *rv = out; - }); + std::string GetLibPath() const { + return lib_path_; } void CreateExternSignature(const Function& func, bool update) { @@ -233,30 +181,30 @@ class GccModuleNode : public ExternModuleNodeBase { std::uniform_int_distribution distr; std::stringstream ss; ss << std::hex << distr(gen); - src_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".cc"; + std::ifstream lib_file("src/relay/backend/contrib/gcc/libs.cc"); + code_.assign((std::istreambuf_iterator(lib_file)), + std::istreambuf_iterator()); lib_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".so"; - std::string cmd = "cp src/relay/backend/contrib/gcc/libs.cc " + src_path_; - CHECK_GE(std::system(cmd.c_str()), 0); - CHECK_GE(std::system("cp src/relay/backend/contrib/gcc/libs.h /tmp/"), 0); } - auto builder = GccBuilder(GetPrefix() + sid); + auto builder = GccBuilder(runtime::contrib::kGccPrefix + sid); builder.VisitExpr(func->body); 
std::string code = builder.build(); // Append the signature. - auto cmd = "echo \"" + code + "\" >> " + src_path_; - CHECK_GE(std::system(cmd.c_str()), 0); + code_ = code_ + code; } void CompileExternLib() override { - std::string cmd = - "g++ -std=c++11 -shared -fPIC -ldl " + src_path_ + " -o " + lib_path_; + // Compile from pipe and generate the library. + std::string code = "echo \'" + code_ + "\'"; + std::string cmd = "g++ -std=c++11 -shared -fPIC -ldl -o " + lib_path_ + " -xc++ -"; + cmd = code + " | " + cmd; + int ret = std::system(cmd.c_str()); if (ret != 0) { LOG(FATAL) << "Failed to compile GCC library. Error code: " << ret; } - Open({lib_path_}); } void Build(const NodeRef& ref) override { @@ -277,22 +225,23 @@ class GccModuleNode : public ExternModuleNodeBase { } private: - std::string src_path_; + std::string code_; std::string lib_path_; }; - /*! * \brief The external compiler/codegen tool. It takes a Relay expression/module and * compile it into a runtime module. * * The external codegen tool should have been registered similiarly to LLVM, - * CUDA, etc, under TVM so the generated code could be packed in a runtime + * CUDA, etc, under TVM, so the generated code could be packed in a runtime * module. This module simplifies code serialization and invocation. */ runtime::Module GccCompiler(const NodeRef& ref) { - std::shared_ptr n = std::make_shared(); - n->Build(ref); + GccCodegen gcc; + gcc.Build(ref); + std::shared_ptr n = + std::make_shared(gcc.GetLibPath()); return runtime::Module(n); } diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc index 721a2324c567..472fc12e4323 100644 --- a/src/relay/backend/contrib/gcc/libs.cc +++ b/src/relay/backend/contrib/gcc/libs.cc @@ -1,4 +1,5 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one +/* + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file @@ -16,12 +17,14 @@ * under the License. */ -#include "libs.h" - #include #include #include +typedef struct { + float** data; +} GccPackedArgs; + #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ extern "C" void p_ID_(float* a, float* b, float* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ diff --git a/src/relay/backend/contrib/gcc/libs.h b/src/relay/backend/contrib/gcc/libs.h deleted file mode 100644 index 261449bda075..000000000000 --- a/src/relay/backend/contrib/gcc/libs.h +++ /dev/null @@ -1,28 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#ifndef TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ -#define TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ - -#include -#include - -typedef struct { - float** data; -} GccPackedArgs; - -#endif // TVM_RELAY_BACKEND_CONTRIB_GCC_LIBS_H_ diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc new file mode 100644 index 000000000000..cbeddd816417 --- /dev/null +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "dnnl.h" + +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +void DNNLModule::Init() { + if (!IsOpen()) { + CHECK_GT(lib_path_.size(), 0U); + Open({lib_path_}); + } +} + +runtime::PackedFunc DNNLModule::GetFunction( + const std::string& name, const std::shared_ptr& sptr_to_self) { + if (name == "init") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + this->Init(); + }); + } else { + std::string curr_id = GetSubgraphID(name); + + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; + + return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { + const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); + runtime::NDArray out_arg = args[args.size() - 1]; + auto out = reinterpret_cast(out_arg->data); + + // Get function from the library + std::string encoded_name = kDnnlPrefix + curr_id; + auto func_s = reinterpret_cast(GetSymbol(encoded_name)); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + DnnlPackedArgs packed_args; + packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); + for (int i = 0; i < args.size() - 1; ++i) { + runtime::NDArray arg = args[i]; + packed_args.data[i] = reinterpret_cast(arg->data); + } + (*func_s)(packed_args, out); + } else { + LOG(FATAL) << "Only support float32 type."; + } + *rv = out; + }); + } +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm + + diff --git a/src/runtime/contrib/dnnl/dnnl.h b/src/runtime/contrib/dnnl/dnnl.h new file mode 100644 index 000000000000..bde9ae068fa8 --- /dev/null +++ b/src/runtime/contrib/dnnl/dnnl.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ +#define TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ + +#include +#include +#include +#include "../extern_common.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +/*! + * \brief Defined a data structure to save dnnl subgraph args. + */ +typedef struct { + void** data; +} DnnlPackedArgs; + +constexpr const char* kDnnlPrefix = "dnnl_"; + +typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); + +class DNNLModule : public ExternModuleBase { + public: + explicit DNNLModule(const std::string& lib_path) : lib_path_(lib_path) {} + + const char* type_key() const final { + return "DNNLModule"; + } + + runtime::PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final; + + void Init() final; + + private: + /*! \brief The path to the compiled dnnl library.*/ + std::string lib_path_; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ diff --git a/include/tvm/relay/contrib_codegen.h b/src/runtime/contrib/extern_common.h similarity index 62% rename from include/tvm/relay/contrib_codegen.h rename to src/runtime/contrib/extern_common.h index 3ad6b137c439..0a400c11649d 100644 --- a/include/tvm/relay/contrib_codegen.h +++ b/src/runtime/contrib/extern_common.h @@ -1,4 +1,5 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one +/* + * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file @@ -15,14 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -#ifndef TVM_RELAY_CONTRIB_CODEGEN_H_ -#define TVM_RELAY_CONTRIB_CODEGEN_H_ + +/*! + * \file src/runtime/contrib/extern_common.h + * \brief The definition of the base class for the external runtime. + */ + +#ifndef TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ +#define TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ #include #include -#include -#include -#include #include #include #include @@ -37,28 +41,42 @@ #endif namespace tvm { -namespace relay { +namespace runtime { namespace contrib { -class ExternModuleNodeBase : public runtime:: ModuleNode { - public: - ExternModuleNodeBase() = default; - ~ExternModuleNodeBase() { - Close(); +/*! + * \brief Split the encoded function name to tokens. + * + * \param the function name string. + * + * \return a vector of tokenized function name splitted by "_". + */ +static inline std::string GetSubgraphID(const std::string& name) { + std::string temp = name; + std::vector tokens; + std::string delimiter = "_"; + size_t pos = 0; + std::string token; + while ((pos = temp.find(delimiter)) != std::string::npos) { + token = temp.substr(0, pos); + tokens.push_back(token); + temp.erase(0, pos + delimiter.length()); } + tokens.push_back(temp); - /*! - * \brief Compile the external library. 
- */ - virtual void CompileExternLib() = 0; + CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; + CHECK(tokens[0] == "subgraph") + << "Function name does not start with \"subgraph\": " << name; + return tokens[1]; +} - /*! - * \brief Build the shared library of external ops. - * - * \param ref The subgraph Relay expression/module to be executed using extern ops. - * - */ - virtual void Build(const NodeRef& ref) = 0; +class ExternModuleBase : public runtime:: ModuleNode { + public: + ExternModuleBase() = default; + + ~ExternModuleBase() { + Close(); + } /*! * \brief Get a PackedFunc from module, which is a function ptr can be invoked @@ -73,53 +91,8 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { const std::string& name, const std::shared_ptr& sptr_to_self) override = 0; - /*! - * \brief Get the source code of the external module. - * - * \param format The format of the source code. - * - * \return The source code of the external library module in the text form. - */ - TVM_DLL std::string GetSource(const std::string& format = "") override { - return ""; - } - const char* type_key() const override { - return "ExternModule"; - } - - /*! - * \brief Split the encoded function name to tokens. - * - * \param the function name string. - * - * \return a vector of tokenized function name splitted by "_". - */ - std::string GetSubgraphID(const Function& func) const { - const auto name_node = - FunctionGetAttr(func, "func_name").as(); - CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; - std::string name = name_node->value; - return GetSubgraphID(name); - } - - std::string GetSubgraphID(const std::string& name) const { - std::string temp = name; - std::vector tokens; - std::string delimiter = "_"; - size_t pos = 0; - std::string token; - while ((pos = temp.find(delimiter)) != std::string::npos) { - token = temp.substr(0, pos); - tokens.push_back(token); - temp.erase(0, pos + delimiter.length()); - } - tokens.push_back(temp); - - CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; - CHECK(tokens[0] == "subgraph") - << "Function name does not start with \"subgraph\": " << name; - return tokens[1]; + return "ExternModuleBase"; } protected: @@ -195,9 +168,12 @@ class ExternModuleNodeBase : public runtime:: ModuleNode { } } #endif + + // Initialize an external runtime module. + virtual void Init() = 0; }; } // namespace contrib -} // namespace relay +} // namespace runtime } // namespace tvm -#endif // TVM_RELAY_CONTRIB_CODEGEN_H_ +#endif // TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc new file mode 100644 index 000000000000..183fd85d65fb --- /dev/null +++ b/src/runtime/contrib/gcc/gcc.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "gcc.h" + +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +void GccModule::Init() { + if (!IsOpen()) { + CHECK_GT(lib_path_.size(), 0U); + Open({lib_path_}); + } +} + +runtime::PackedFunc GccModule::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + if (name == "init") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + this->Init(); + }); + } else { + std::string curr_id = GetSubgraphID(name); + + CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; + // Generate an external packed function + return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { + const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); + runtime::NDArray out_arg = args[args.size() - 1]; + auto out = reinterpret_cast(out_arg->data); + + // Get function from the library + std::string encoded_name = "gcc_" + curr_id; + auto func_s = reinterpret_cast(GetSymbol(encoded_name)); + + // Reinterpret data and function to the right type and invoke + if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { + GccPackedArgs packed_args; + packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); + for (int i = 0; i < args.size() - 1; ++i) { + runtime::NDArray arg = args[i]; + packed_args.data[i] = reinterpret_cast(arg->data); + } + (*func_s)(packed_args, out); + } else { + LOG(FATAL) << "Only support float32 type."; + } + *rv = out; + }); + } +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/gcc/gcc.h b/src/runtime/contrib/gcc/gcc.h new file mode 100644 index 000000000000..4c23c218ea8f --- /dev/null +++ b/src/runtime/contrib/gcc/gcc.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_GCC_GCC_H_ +#define TVM_RUNTIME_CONTRIB_GCC_GCC_H_ + +#include +#include +#include +#include "../extern_common.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +constexpr const char* kGccPrefix = "gcc_"; + +/*! + * \brief Defined a data structure to save subgraph args. 
+ */ +typedef struct { + float** data; +} GccPackedArgs; + +typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); + +class GccModule : public ExternModuleBase { + public: + explicit GccModule(const std::string& lib_path) : lib_path_(lib_path) {} + + const char* type_key() const final { + return "GccModule"; + } + + runtime::PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final; + + void Init() final; + + private: + std::string lib_path_; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_GCC_GCC_H_ diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index f02fadb53ed9..81d3d11e0ecd 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -308,10 +308,17 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { // Number of fields = 0 break; } - case Opcode::InvokePacked: { + case Opcode::InvokeExternal: { // Number of fields = 3 + instr.arity // Note that arity includes both input arguments and outputs. We will // put all the `arity` number of fields in the end for serialization. + fields.assign({instr.ext_index, instr.ext_arity, instr.ext_output_size}); + // Save the args. + fields.insert(fields.end(), instr.ext_args, instr.ext_args + instr.ext_arity); + break; + } + case Opcode::InvokePacked: { + // Number of fields = 3 + instr.arity fields.assign({instr.packed_index, instr.arity, instr.output_size}); // Save the args. fields.insert(fields.end(), instr.packed_args, instr.packed_args + instr.arity); @@ -557,6 +564,17 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { DCHECK(instr.fields.empty()); return Instruction::Fatal(); } + case Opcode::InvokeExternal: { + // Number of fields = 3 + instr.arity + DCHECK_GE(instr.fields.size(), 3U); + DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); + + Index ext_index = instr.fields[0]; + Index arity = instr.fields[1]; + Index output_size = instr.fields[2]; + std::vector args = ExtractFields(instr.fields, 3, arity); + return Instruction::InvokePacked(ext_index, arity, output_size, args); + } case Opcode::InvokePacked: { // Number of fields = 3 + instr.arity DCHECK_GE(instr.fields.size(), 3U); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 4114efdeff3a..f083b058cafd 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -853,6 +853,7 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { const std::string& symb = exec->external_func_map.at(subgraph_id); auto ext_mod = exec->ext_libs.at(ext_lib_idx); CHECK(ext_mod.operator->()) << "external module is not defined." 
<< "\n"; + ext_mod.GetFunction("init")(); external_funcs[subgraph_id] = ext_mod.GetFunction(symb); } } diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 434ad2ea1730..b8bad9419ad2 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -193,8 +193,7 @@ def test_extern_gcc_single_op(): y_data = np.random.rand(8, 8).astype('float32') mod = relay.Module() mod["main"] = f - mod = relay.transform.ExternOp("gcc")(mod) - mod = relay.transform.PartitionGraph()(mod) + mod = relay.build_extern(mod, "gcc") for kind in ["debug", "vm"]: ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") @@ -212,8 +211,7 @@ def test_extern_gcc(): y_data = np.random.rand(2, 2).astype('float32') mod = relay.Module() mod["main"] = f - mod = relay.transform.ExternOp("gcc")(mod) - mod = relay.transform.PartitionGraph()(mod) + mod = relay.build_extern(mod, "gcc") for kind in ["debug", "vm"]: ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") @@ -221,7 +219,6 @@ def test_extern_gcc(): tvm.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data)) -@nottest def test_extern_dnnl(): dtype = 'float32' ishape = (1, 32, 14, 14) @@ -268,9 +265,7 @@ def test_extern_dnnl_mobilenet(): ishape = (1, 3, 224, 224) mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') - #mod['main'] = MobileNetAnnotator('dnnl').visit(mod['main']) - mod = relay.transform.ExternOp('dnnl')(mod) - mod = relay.transform.PartitionGraph()(mod) + mod = relay.build_extern(mod, "dnnl") i_data = np.random.uniform(0, 1, ishape).astype(dtype) From 789120809f291d19d5a946265cc4a25dfa460b12 Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Thu, 31 Oct 2019 15:13:11 -0700 Subject: [PATCH 27/34] change macro to function to reduce .so size --- cmake/config.cmake | 6 + cmake/modules/contrib/Extern.cmake | 27 +- src/relay/backend/contrib/dnnl/codegen.cc | 40 +-- src/relay/backend/contrib/dnnl/libs.cc | 362 +++++++++++----------- tutorials/dev/custom_relay_backend.py | 35 ++- 5 files changed, 245 insertions(+), 225 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 1ef956c7ee18..af785cc4524b 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -163,6 +163,12 @@ set(USE_ROCBLAS OFF) # Whether use contrib sort set(USE_SORT ON) +# Whether use contrib extern (use ";" to separate multiple externs) +# Available externs: +# gcc +# dnnl +set(USE_EXTERN none) + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 8cce4365ef53..f44ff1c9fbf4 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -17,16 +17,23 @@ message(STATUS "Build with relay.backend.contrib") -# Gcc (for demo purpose) -file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) -list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) +list(FIND USE_EXTERN "gcc" _gcc_idx) +if(_gcc_idx GREATER -1) + file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) + list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) -file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) -list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) + file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) + list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) + 
message(STATUS "Use extern library: GCC") +endif() -# DNNL (for demo purpose) -file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) -list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) +list(FIND USE_EXTERN "dnnl" _dnnl_idx) +if(_dnnl_idx GREATER -1) + file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) + list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) + + file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*) + list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) + message(STATUS "Use extern library: MKLDNN") +endif() -file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*.cc) -list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 9eb855280dcb..06b68f9debbb 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -57,17 +57,16 @@ class DnnlBuilder : public ExprVisitor { } void VisitExpr_(const CallNode* call) final { - std::string func_name = subgraph_id_ + "_" + std::to_string(func_idx_++); // Make function declaration std::string decl = ""; // Args: ID - std::string macro = ""; + std::string func_name = ""; std::vector args; if (IsOp(call, "nn.conv2d")) { - macro = "CONV2D"; + func_name = "dnnl_conv2d"; const auto* conv2d_attr = call->attrs.as(); auto ishape = GetShape(call->args[0]->checked_type()); @@ -88,7 +87,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); } else if (IsOp(call, "nn.dense")) { - macro = "DENSE"; + func_name = "dnnl_dense"; auto ishape = GetShape(call->args[0]->checked_type()); auto wshape = GetShape(call->args[1]->checked_type()); @@ -98,7 +97,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(wshape[0])); } else if (IsOp(call, "nn.relu")) { - macro = "RELU"; + func_name = "dnnl_relu"; auto ishape = GetShape(call->args[0]->checked_type()); // Args: N, C, H, W @@ -106,7 +105,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(s)); } } else if (IsOp(call, "nn.batch_norm")) { - macro = "BN"; + func_name = "dnnl_bn"; const auto* bn_attr = call->attrs.as(); auto ishape = GetShape(call->args[0]->checked_type()); @@ -118,7 +117,7 @@ class DnnlBuilder : public ExprVisitor { // Args: epilson args.push_back(std::to_string(bn_attr->epsilon)); } else if (IsOp(call, "add")) { - macro = "ADD"; + func_name = "dnnl_add"; auto ishape = GetShape(call->args[0]->checked_type()); // Args: H, W @@ -129,14 +128,7 @@ class DnnlBuilder : public ExprVisitor { LOG(FATAL) << "Unsupported op: " << AsText(call->op, false); } - decl = macro + "(" + func_name; - for (size_t i = 0; i < args.size(); ++i) { - decl += ", " + args[i]; - } - decl += ");"; - func_decl_.push_back(decl); - - // Make function call when visiting arguments + // Make function call with input buffers when visiting arguments bool first = true; std::string func_call = func_name + "("; for (size_t i = 0; i < call->args.size(); ++i) { @@ -150,6 +142,7 @@ class DnnlBuilder : public ExprVisitor { } } + // Analyze the output buffer auto type_node = call->checked_type().as(); CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) << "Only support single output tensor with float type"; @@ -162,8 +155,13 @@ class DnnlBuilder : public ExprVisitor { std::string buf_decl = "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + 
");"; buf_decl_.push_back(buf_decl); + func_call += ", " + out; - func_call += ", " + out + ");"; + // Attach attribute arguments + for (size_t i = 0; i < args.size(); ++i) { + func_call += ", " + args[i]; + } + func_call += ");"; subgraph_body.push_back(func_call); // Update output buffer @@ -174,11 +172,6 @@ class DnnlBuilder : public ExprVisitor { std::string build() { std::string code = ""; - // Write function macros - for (auto decl : func_decl_) { - code += decl + "\n"; - } - // Write subgraph function declaration code += "extern \"C\" void " + subgraph_id_ + "(DnnlPackedArgs args, float* out) {\n"; @@ -205,11 +198,9 @@ class DnnlBuilder : public ExprVisitor { private: std::string subgraph_id_ = ""; - int func_idx_ = 0; int buf_idx_ = 0; std::vector subgraph_args_; std::vector subgraph_body; - std::vector func_decl_; std::vector buf_decl_; std::vector> out_; @@ -262,8 +253,7 @@ class DNNLCodegen : public ExternCodegenBase { auto builder = DnnlBuilder(runtime::contrib::kDnnlPrefix + sid); builder.VisitExpr(func->body); - std::string code = builder.build(); - code_ = code_ + code; + code_ += builder.build(); } void CompileExternLib() override { diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc index dfbdb65d16d0..b97b36099ef6 100644 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ b/src/relay/backend/contrib/dnnl/libs.cc @@ -43,186 +43,182 @@ inline void read_from_dnnl_memory(void* handle, const memory& mem) { std::copy(src, src + bytes, reinterpret_cast(handle)); } -#define CONV2D(p_ID_, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, \ - p_Sw_) \ - extern "C" void p_ID_(float* data, float* weights, float* out) { \ - using tag = memory::format_tag; \ - using dt = memory::data_type; \ - engine eng(engine::kind::cpu, 0); \ - stream s(eng); \ - \ - memory::dims conv2d_src_tz = {p_N_, p_C_, p_H_, p_W_}; \ - memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; \ - if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; \ - memory::dims conv2d_bias_tz = {p_O_}; \ - memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, \ - (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; \ - memory::dims conv2d_strides = {p_Sh_, p_Sw_}; \ - memory::dims conv2d_padding = {p_Ph_, p_Pw_}; \ - \ - std::vector conv2d_bias(p_O_, 0); \ - \ - auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); \ - auto user_weights_memory = \ - memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? 
tag::goihw : tag::oihw}, eng, weights); \ - auto conv2d_user_bias_memory = \ - memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); \ - \ - auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); \ - auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); \ - auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); \ - auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); \ - \ - auto conv2d_desc = \ - convolution_forward::desc(prop_kind::forward_inference, algorithm::convolution_direct, \ - conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md, \ - conv2d_strides, conv2d_padding, conv2d_padding); \ - auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); \ - \ - auto conv2d_src_memory = user_src_memory; \ - auto conv2d_weights_memory = user_weights_memory; \ - auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); \ - \ - auto conv = convolution_forward(conv2d_prim_desc); \ - conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, \ - {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, \ - {DNNL_ARG_BIAS, conv2d_user_bias_memory}, \ - {DNNL_ARG_DST, conv2d_dst_memory}}); \ - s.wait(); \ - read_from_dnnl_memory(out, conv2d_dst_memory); \ - } - -#define DENSE(p_ID_, p_B_, p_I_, p_O_) \ - extern "C" void p_ID_(float* data, float* weight, float* out) { \ - using tag = memory::format_tag; \ - using dt = memory::data_type; \ - \ - engine eng(engine::kind::cpu, 0); \ - stream s(eng); \ - \ - memory::dims data_tz = {p_B_, p_I_}; \ - memory::dims weight_tz = {p_O_, p_I_}; \ - memory::dims bias_tz = {p_O_}; \ - memory::dims dst_tz = {p_B_, p_O_}; \ - \ - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nc}; \ - auto weight_md = memory::desc({{weight_tz}, dt::f32, tag::nc}); \ - auto bias_md = memory::desc({{bias_tz}, dt::f32, tag::x}); \ - auto dst_md = memory::desc({{dst_tz}, dt::f32, tag::nc}); \ - \ - std::vector bias(p_O_, 0); \ - auto data_memory = memory(data_md, eng, data); \ - auto weight_memory = memory(weight_md, eng, weight); \ - auto bias_memory = memory(bias_md, eng, bias.data()); \ - auto dst_memory = memory(dst_md, eng); \ - \ - auto dense_desc = inner_product_forward::desc(prop_kind::forward_inference, data_md, \ - weight_md, bias_md, dst_md); \ - auto dense_prim_desc = inner_product_forward::primitive_desc(dense_desc, eng); \ - assert(dst_md == dense_prim_desc.dst_desc()); \ - \ - auto dense = inner_product_forward(dense_prim_desc); \ - dense.execute(s, {{DNNL_ARG_SRC, data_memory}, \ - {DNNL_ARG_WEIGHTS, weight_memory}, \ - {DNNL_ARG_BIAS, bias_memory}, \ - {DNNL_ARG_DST, dst_memory}}); \ - s.wait(); \ - read_from_dnnl_memory(out, dst_memory); \ - } - -#define RELU(p_ID_, p_N_, p_C_, p_H_, p_W_) \ - extern "C" void p_ID_(float* data, float* out) { \ - using tag = memory::format_tag; \ - using dt = memory::data_type; \ - \ - engine eng(engine::kind::cpu, 0); \ - stream s(eng); \ - \ - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ - \ - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ - \ - auto data_memory = memory(data_md, eng, data); \ - auto dst_memory = memory(data_md, eng); \ - \ - auto relu_desc = \ - eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu, data_md, 0); \ - auto relu_prim_desc = eltwise_forward::primitive_desc(relu_desc, eng); \ - assert(data_md == relu_prim_desc.dst_desc()); \ - \ - auto relu = eltwise_forward(relu_prim_desc); \ - relu.execute(s, {{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, 
dst_memory}}); \ - s.wait(); \ - read_from_dnnl_memory(out, dst_memory); \ - } - -#define BN(p_ID_, p_N_, p_C_, p_H_, p_W_, p_E_) \ - extern "C" void p_ID_(float* data, float* gamma, float* beta, float* mean, float* variance, \ - float* out) { \ - using tag = memory::format_tag; \ - using dt = memory::data_type; \ - \ - engine eng(engine::kind::cpu, 0); \ - stream s(eng); \ - \ - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ - \ - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ - \ - auto data_memory = memory(data_md, eng, data); \ - auto dst_memory = memory(data_md, eng); \ - \ - auto bn_desc = batch_normalization_forward::desc( \ - prop_kind::forward_inference, data_md, p_E_, \ - normalization_flags::use_global_stats | normalization_flags::use_scale_shift); \ - auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); \ - assert(data_md == bn_prim_desc.dst_desc()); \ - \ - float* weight = reinterpret_cast(malloc(sizeof(float) * 2 * p_C_)); \ - memcpy(weight, gamma, sizeof(float) * p_C_); \ - memcpy(weight + p_C_, beta, sizeof(float) * p_C_); \ - \ - auto weight_memory = memory(bn_prim_desc.weights_desc(), eng, weight); \ - auto mean_memory = memory(bn_prim_desc.mean_desc(), eng, mean); \ - auto variance_memory = memory(bn_prim_desc.variance_desc(), eng, variance); \ - \ - auto bn = batch_normalization_forward(bn_prim_desc); \ - bn.execute(s, {{DNNL_ARG_SRC, data_memory}, \ - {DNNL_ARG_DST, dst_memory}, \ - {DNNL_ARG_SCALE_SHIFT, weight_memory}, \ - {DNNL_ARG_MEAN, mean_memory}, \ - {DNNL_ARG_VARIANCE, variance_memory}}); \ - s.wait(); \ - read_from_dnnl_memory(out, dst_memory); \ - free(weight); \ - } - -#define ADD(p_ID_, p_N_, p_C_, p_H_, p_W_) \ - extern "C" void p_ID_(float* data, float* weight, float* out) { \ - using tag = memory::format_tag; \ - using dt = memory::data_type; \ - \ - engine eng(engine::kind::cpu, 0); \ - stream s(eng); \ - \ - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; \ - \ - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; \ - auto weight_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); \ - auto dst_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); \ - \ - auto data_memory = memory(data_md, eng, data); \ - auto weight_memory = memory(weight_md, eng, weight); \ - auto dst_memory = memory(dst_md, eng); \ - \ - auto add_desc = binary::desc(algorithm::binary_add, data_md, weight_md, dst_md); \ - auto add_prim_desc = binary::primitive_desc(add_desc, eng); \ - assert(dst_md == add_prim_desc.dst_desc()); \ - \ - auto add = binary(add_prim_desc); \ - add.execute(s, {{DNNL_ARG_SRC_0, data_memory}, \ - {DNNL_ARG_SRC_1, weight_memory}, \ - {DNNL_ARG_DST, dst_memory}}); \ - s.wait(); \ - read_from_dnnl_memory(out, dst_memory); \ - } +extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, + int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, + int p_Kw_, int p_Sh_, int p_Sw_) { + using tag = memory::format_tag; + using dt = memory::data_type; + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims conv2d_src_tz = {p_N_, p_C_, p_H_, p_W_}; + memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; + if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; + memory::dims conv2d_bias_tz = {p_O_}; + memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, + (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; + memory::dims conv2d_strides = {p_Sh_, p_Sw_}; + memory::dims conv2d_padding = {p_Ph_, p_Pw_}; + + 
std::vector conv2d_bias(p_O_, 0); + + auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); + auto user_weights_memory = + memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? tag::goihw : tag::oihw}, eng, weights); + auto conv2d_user_bias_memory = + memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); + + auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); + auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); + auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); + auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); + + auto conv2d_desc = convolution_forward::desc( + prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md, + conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding, conv2d_padding); + auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); + + auto conv2d_src_memory = user_src_memory; + auto conv2d_weights_memory = user_weights_memory; + auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); + + auto conv = convolution_forward(conv2d_prim_desc); + conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, + {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, + {DNNL_ARG_BIAS, conv2d_user_bias_memory}, + {DNNL_ARG_DST, conv2d_dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, conv2d_dst_memory); +} + +extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_B_, p_I_}; + memory::dims weight_tz = {p_O_, p_I_}; + memory::dims bias_tz = {p_O_}; + memory::dims dst_tz = {p_B_, p_O_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nc}; + auto weight_md = memory::desc({{weight_tz}, dt::f32, tag::nc}); + auto bias_md = memory::desc({{bias_tz}, dt::f32, tag::x}); + auto dst_md = memory::desc({{dst_tz}, dt::f32, tag::nc}); + + std::vector bias(p_O_, 0); + auto data_memory = memory(data_md, eng, data); + auto weight_memory = memory(weight_md, eng, weight); + auto bias_memory = memory(bias_md, eng, bias.data()); + auto dst_memory = memory(dst_md, eng); + + auto dense_desc = inner_product_forward::desc(prop_kind::forward_inference, data_md, weight_md, + bias_md, dst_md); + auto dense_prim_desc = inner_product_forward::primitive_desc(dense_desc, eng); + assert(dst_md == dense_prim_desc.dst_desc()); + + auto dense = inner_product_forward(dense_prim_desc); + dense.execute(s, {{DNNL_ARG_SRC, data_memory}, + {DNNL_ARG_WEIGHTS, weight_memory}, + {DNNL_ARG_BIAS, bias_memory}, + {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); +} + +extern "C" void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, int p_W_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + + auto data_memory = memory(data_md, eng, data); + auto dst_memory = memory(data_md, eng); + + auto relu_desc = + eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu, data_md, 0); + auto relu_prim_desc = eltwise_forward::primitive_desc(relu_desc, eng); + assert(data_md == relu_prim_desc.dst_desc()); + + auto relu = eltwise_forward(relu_prim_desc); + relu.execute(s, {{DNNL_ARG_SRC, 
data_memory}, {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); +} + +extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, float* variance, + float* out, int p_N_, int p_C_, int p_H_, int p_W_, int p_E_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + + auto data_memory = memory(data_md, eng, data); + auto dst_memory = memory(data_md, eng); + + auto bn_desc = batch_normalization_forward::desc( + prop_kind::forward_inference, data_md, p_E_, + normalization_flags::use_global_stats | normalization_flags::use_scale_shift); + auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); + assert(data_md == bn_prim_desc.dst_desc()); + + float* weight = reinterpret_cast(malloc(sizeof(float) * 2 * p_C_)); + memcpy(weight, gamma, sizeof(float) * p_C_); + memcpy(weight + p_C_, beta, sizeof(float) * p_C_); + + auto weight_memory = memory(bn_prim_desc.weights_desc(), eng, weight); + auto mean_memory = memory(bn_prim_desc.mean_desc(), eng, mean); + auto variance_memory = memory(bn_prim_desc.variance_desc(), eng, variance); + + auto bn = batch_normalization_forward(bn_prim_desc); + bn.execute(s, {{DNNL_ARG_SRC, data_memory}, + {DNNL_ARG_DST, dst_memory}, + {DNNL_ARG_SCALE_SHIFT, weight_memory}, + {DNNL_ARG_MEAN, mean_memory}, + {DNNL_ARG_VARIANCE, variance_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); + free(weight); +} + +extern "C" void dnnl_add(float* data, float* weight, float* out, + int p_N_, int p_C_, int p_H_, int p_W_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + auto weight_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); + auto dst_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); + + auto data_memory = memory(data_md, eng, data); + auto weight_memory = memory(weight_md, eng, weight); + auto dst_memory = memory(dst_md, eng); + + auto add_desc = binary::desc(algorithm::binary_add, data_md, weight_md, dst_md); + auto add_prim_desc = binary::primitive_desc(add_desc, eng); + assert(dst_md == add_prim_desc.dst_desc()); + + auto add = binary(add_prim_desc); + add.execute( + s, + {{DNNL_ARG_SRC_0, data_memory}, {DNNL_ARG_SRC_1, weight_memory}, {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); +} diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index 7359fea1bd7b..1933cd94b860 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -226,8 +226,17 @@ def visit_call(self, call): # Implement The Runtime Dispather # ------------------------------- # The last step is invoking the generated external library in runtime. -# Specifically, we need to implement tvm runtime `Module` compatible -# `GetFunction()` in codegen.cc. The function takes a subgraph name and returns +# We create a runtime module `GccModule` derived from `ExternModuleBase` +# in src/runtime/contrib/gcc/gcc.h for Relay runtime to dispatch the +# generated library/executable. Then, we implement the dispatcher in +# src/runtime/contrib/gcc/gcc.cc. 
Note that although the `GccModule` constructor
+# accepts the path of generated library/executable for runtime initialization,
+# it can be customized by each external backend to accept any type of required
+# artifacts.
+
+######################################################################
+# In addition, we implement tvm runtime `Module` compatible
+# `GetFunction()`. The function takes a subgraph name and returns
 # a `PackedFunc` that executes the subgraph with runtime input data. Note that
 # the runtime data in TVM is provided in the tvm `NDArray` format. It's
 # vendors' repsonsiblity to deserialize it into the format that they library
@@ -243,18 +252,30 @@ def visit_call(self, call):
 # Add Codegen to TVM Building Process
 # -----------------------------------
 # Finally, we include the implemented codegen to the cmake config so that
-# it will be built along with the TVM. To do so, we add two lines to
-# cmake/modules/contrib/Extern.cmake:
-# file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc)
-# list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC})
+# it will be built along with TVM. In cmake/modules/contrib/Extern.cmake:
+#
+# list(FIND USE_EXTERN "gcc" _gcc_idx)
+# if(_gcc_idx GREATER -1)
+# file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc)
+# list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC})
+# file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc)
+# list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC})
+# message(STATUS "Use extern library: GCC")
+# endif()
 
 ######################################################################
-# We can now test the correctness of the external GCC backend:
+# We can now build TVM with the external GCC backend and test its correctness:
+# 1. cd build
+# 2. set(USE_EXTERN gcc) in config.cmake
+# 3. cmake ..; make -j
 #
 # .. note::
 #     The complete GCC backend implementation is in the TVM codebase
 #     so we can directly use it in this tutorial for demonstration.
+#
+#     Multiple external backends can be enabled simultaneously by ";".
+#     For example: set(USE_EXTERN gcc;dnnl)
 
 import numpy as np
 

From b8f1db0bd6e655581511394765583f1102c2f68c Mon Sep 17 00:00:00 2001
From: Zhi Chen
Date: Fri, 1 Nov 2019 20:50:21 +0000
Subject: [PATCH 28/34] create attr const string for Function

---
 include/tvm/relay/expr.h                  | 19 +++++++++++++++++++
 src/relay/backend/compile_engine.cc       |  4 ++--
 src/relay/backend/contrib/dnnl/codegen.cc |  1 -
 src/relay/backend/vm/compiler.cc          |  4 ++--
 src/relay/backend/vm/lambda_lift.cc       |  6 ++----
 src/relay/ir/expr.cc                      |  8 ++++----
 src/relay/pass/fuse_ops.cc                |  2 +-
 src/relay/pass/partition_graph.cc         | 10 ++++++----
 src/relay/pass/pass_manager.cc            | 10 +++++-----
 9 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index db4cc6c993b0..ee6db9342bb7 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -596,6 +596,25 @@ std::string AsText(const NodeRef& node,
                    bool show_meta_data = true,
                    runtime::TypedPackedFunc annotate = nullptr);
 
+/*! \brief namespace of the attributes that are attached to a function. */
+namespace attr {
+/*! \brief Mark the function as a primitive function. */
+constexpr const char* kPrimitive = "Primitive";
+/*!
+ * \brief Mark the function as an external function that needs to be handled by
+ * the external codegen tool/backend.
+ */
+constexpr const char* kExternal = "External";
+/*! \brief Indicate if the function is a closure. */
+constexpr const char* kClosure = "Closure";
+/*! 
\brief Store a Var to parameter/Constant mapping on a Function. */ +constexpr const char* kParams = "__params__"; +/*! \brief Store the function name. */ +constexpr const char* kFuncName = "FuncName"; +/*! \brief Mark if the function should be avoided being optimized. */ +constexpr const char* kSkipOptimization = "SkipOptimization"; +} // namespace attr + } // namespace relay } // namespace tvm #endif // TVM_RELAY_EXPR_H_ diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 4425a0ec9969..798d8b632bb2 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -600,7 +600,7 @@ class CompileEngineImpl : public CompileEngineNode { if (value->packed_func != nullptr) return value->packed_func; // Handle 3rd party generated code library. if (value->lib.operator->()) { - auto name = FunctionGetAttr(key->source_func, "func_name"); + auto name = FunctionGetAttr(key->source_func, attr::kFuncName); const tvm::ir::StringImm* func_name = name.as(); CHECK(func_name); value->lib.GetFunction("init")(); @@ -661,7 +661,7 @@ class CompileEngineImpl : public CompileEngineNode { } if (key->source_func->IsExternal()) { - auto compiler = FunctionGetAttr(key->source_func, "External"); + auto compiler = FunctionGetAttr(key->source_func, attr::kExternal); const tvm::ir::StringImm* code_gen = compiler.as(); CHECK(code_gen); std::string ext_name = "relay.ext." + code_gen->value; diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 06b68f9debbb..64489bbbddef 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -57,7 +57,6 @@ class DnnlBuilder : public ExprVisitor { } void VisitExpr_(const CallNode* call) final { - // Make function declaration std::string decl = ""; diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 066316e0202c..a62b85743fc6 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1016,12 +1016,12 @@ void VMCompiler::ExternalFuncCodegen() { std::unordered_map > func_codgen; for (size_t i = 0; i < context_.external_funcs.size(); i++) { const auto& it = context_.external_funcs[i]; - auto func_name = FunctionGetAttr(it, "func_name"); + auto func_name = FunctionGetAttr(it, attr::kFuncName); CHECK(func_name.defined()) << "Cannot find func_name attribute"; const auto* func_name_str = func_name.as(); CHECK(func_name_str); CHECK(it->IsExternal()); - auto comp = FunctionGetAttr(it, "External"); + auto comp = FunctionGetAttr(it, attr::kExternal); const auto* comp_name = comp.as(); CHECK(comp_name); if (comp_module.count(comp_name->value) == 0) { diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index 7f21defc9d12..c841f87dd836 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -37,21 +37,19 @@ namespace tvm { namespace relay { namespace vm { -static const char* kIsClosure = "IsClosure"; - inline std::string GenerateName(const Function& func) { size_t hash = StructuralHash()(func); return std::string("lifted_name") + std::to_string(hash); } bool IsClosure(const Function& func) { - NodeRef res = FunctionGetAttr(func, kIsClosure); + NodeRef res = FunctionGetAttr(func, attr::kClosure); const ir::IntImm* pval = res.as(); return pval && pval->value != 0; } Function MarkClosure(const Function& func) { - return FunctionSetAttr(func, kIsClosure, tvm::Integer(1)); + return FunctionSetAttr(func, 
attr::kClosure, tvm::Integer(1)); } /* The goal of this class is to lift out any nested functions into top-level diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 60fac27d7ef8..3673e9d4449b 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -157,13 +157,13 @@ FuncType FunctionNode::func_type_annotation() const { } bool FunctionNode::IsPrimitive() const { - NodeRef res = FunctionGetAttr(GetRef(this), "Primitive"); + NodeRef res = FunctionGetAttr(GetRef(this), attr::kPrimitive); const ir::IntImm* pval = res.as(); return pval && pval->value != 0; } Function FunctionNode::SetParams(const tvm::Map& parameters) const { - return FunctionSetAttr(GetRef(this), "__params__", parameters); + return FunctionSetAttr(GetRef(this), attr::kParams, parameters); } TVM_REGISTER_API("relay._expr.FunctionSetParams") @@ -173,7 +173,7 @@ TVM_REGISTER_API("relay._expr.FunctionSetParams") }); tvm::Map FunctionNode::GetParams() const { - auto node_ref = FunctionGetAttr(GetRef(this), "__params__"); + auto node_ref = FunctionGetAttr(GetRef(this), attr::kParams); return Downcast>(node_ref); } @@ -183,7 +183,7 @@ TVM_REGISTER_API("relay._expr.FunctionGetParams") }); bool FunctionNode::IsExternal() const { - NodeRef res = FunctionGetAttr(GetRef(this), "External"); + NodeRef res = FunctionGetAttr(GetRef(this), attr::kExternal); const ir::StringImm* pval = res.as(); return pval; } diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index df3d5e58a3c2..9aba1aca9a5b 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -933,7 +933,7 @@ class FuseMutator : private ExprMutator { visitor(body); const GroupInfo& ginfo = ginfo_[group]; auto func = FunctionNode::make(ginfo.params, body, ret_type, {}); - func = FunctionSetAttr(func, "Primitive", tvm::Integer(visitor.has_call)); + func = FunctionSetAttr(func, attr::kPrimitive, tvm::Integer(visitor.has_call)); return CallNode::make(func, ginfo.arguments, Attrs()); } diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index f67c24efe5b0..aa0d4fc99911 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -201,9 +201,9 @@ class Partitioner : public ExprMutator { Expr arg0 = call->args[0]; std::string name = "subgraph_" + std::to_string(subgraph->id); subgraph_func = - FunctionSetAttr(subgraph_func, "func_name", tvm::ir::StringImm::make(name)); - subgraph_func = FunctionSetAttr(subgraph_func, "Primitive", tvm::Integer(1)); - subgraph_func = FunctionSetAttr(subgraph_func, "External", + FunctionSetAttr(subgraph_func, attr::kFuncName, tvm::ir::StringImm::make(name)); + subgraph_func = FunctionSetAttr(subgraph_func, attr::kPrimitive, tvm::Integer(1)); + subgraph_func = FunctionSetAttr(subgraph_func, attr::kExternal, tvm::ir::StringImm::make(subgraph_attrs->compiler)); return CallNode::make(subgraph_func, args); } @@ -326,7 +326,9 @@ class Partitioner : public ExprMutator { }; /*! - * \brief Combine parallel subgraphs that belong to the same codegen backend. + * \brief TODO(@zhiics, @comaniac) Combine parallel subgraphs that belong to + * the same codegen backend. This reduces rounds trips between TVM and external + * backends. * * For example, sg1 and sg2 should be combined if they belong to the same * codegen tool in the following case. 
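For reference, the following is a minimal, illustrative Python sketch of how the partitioning flow above is exercised, mirroring tests/python/relay/test_pass_partition_graph.py. It assumes TVM is built with the external GCC backend enabled (set(USE_EXTERN gcc)) and uses the relay.build_extern helper added in this series; the carved-out subgraph functions carry the Primitive, External, and FuncName ("subgraph_<id>") attributes defined earlier.

    import numpy as np
    import tvm
    from tvm import relay

    # A small graph that the toy "gcc" backend can fully offload.
    x = relay.var('x', shape=(2, 2))
    y = relay.var('y', shape=(2, 2))
    f = relay.Function([x, y], relay.subtract(relay.multiply(y, y), relay.add(x, x)))
    mod = relay.Module()
    mod["main"] = f

    # Annotate the supported ops for "gcc" and partition them into external
    # subgraph functions handled by the registered external codegen.
    mod = relay.build_extern(mod, "gcc")

    # The external subgraphs are dispatched to the generated shared library at runtime.
    x_data = np.random.rand(2, 2).astype('float32')
    y_data = np.random.rand(2, 2).astype('float32')
    ex = relay.create_executor("debug", mod=mod, ctx=tvm.cpu(), target="llvm")
    res = ex.evaluate()(x_data, y_data)
    np.testing.assert_allclose(res.asnumpy(), (y_data * y_data) - (x_data + x_data), rtol=1e-5)

The same sketch runs with the "vm" executor kind, which goes through the InvokeExternal path shown in the VM changes.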
diff --git a/src/relay/pass/pass_manager.cc b/src/relay/pass/pass_manager.cc index b025d3787f9e..1c541cc730a5 100644 --- a/src/relay/pass/pass_manager.cc +++ b/src/relay/pass/pass_manager.cc @@ -329,12 +329,12 @@ Module FunctionPassNode::operator()(const Module& mod, return updated_mod; } -// TODO(zhiics) Create an enum attribute for FunctionNode -// enum Attribute {kPrimitive, kSkipOptimization} bool FunctionPassNode::SkipFunction(const Function& func) const { - NodeRef res = FunctionGetAttr(func, "SkipOptimization"); - const ir::IntImm* pval = res.as(); - return pval && pval->value != 0; + NodeRef skip_opt = FunctionGetAttr(func, attr::kSkipOptimization); + NodeRef is_extern = FunctionGetAttr(func, attr::kExternal); + const ir::IntImm* pval = skip_opt.as(); + const ir::StringImm* sval = is_extern.as(); + return (pval && pval->value != 0) || (sval && sval->value.size() > 0); } Sequential::Sequential(tvm::Array passes, PassInfo pass_info) { From 27366194f3e098644360e6cbe85f3d376c2aae5e Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Fri, 1 Nov 2019 22:58:51 +0000 Subject: [PATCH 29/34] rebase to upstream --- python/tvm/relay/build_module.py | 31 +++++++++++---------- src/relay/backend/contrib/contrib_codegen.h | 2 +- src/relay/backend/vm/compiler.cc | 16 +++++------ src/runtime/vm/vm.cc | 9 +++--- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index dcc7103a775e..c9280552216b 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -246,21 +246,14 @@ def build(mod, target=None, target_host=None, params=None): return graph_json, mod, params -<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d def optimize(mod, target=None, params=None): """Helper function that optimizes a Relay module. -======= -def build_extern(mod, target): - """Helper function that builds a Relay function to run on external codegen - tools. ->>>>>>> Improve: Parameters ---------- mod : relay.Module The module to build. Using relay.Function is deprecated. -<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to @@ -269,15 +262,10 @@ def build_extern(mod, target): params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. -======= - target : str - The name of the external compilation target. ->>>>>>> Improve: Returns ------- mod : relay.Module -<<<<<<< 40fc1668d61095d2f34171f221c1a98f455ff24d The optimized relay module. params : dict @@ -307,7 +295,23 @@ def build_extern(mod, target): bld_mod = BuildModule() mod, params = bld_mod.optimize(func, target, params) return mod, params -======= + + +def build_extern(mod, target): + """Helper function that builds a Relay function to run on external codegen + tools. + + Parameters + ---------- + mod : relay.Module + The module to build. Using relay.Function is deprecated. + + target : str + The name of the external compilation target. + + Returns + ------- + mod : relay.Module The relay module contains partitioned subgraphes for external codegen tools. 
""" @@ -318,7 +322,6 @@ def build_extern(mod, target): _transform.PartitionGraph()]) mod = seq(mod) return mod ->>>>>>> Improve: class GraphExecutor(_interpreter.Executor): diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h index b3c7894ddd02..0a00c2542f33 100644 --- a/src/relay/backend/contrib/contrib_codegen.h +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -58,7 +58,7 @@ class ExternCodegenBase { */ std::string GetSubgraphID(const Function& func) const { const auto name_node = - FunctionGetAttr(func, "func_name").as(); + FunctionGetAttr(func, attr::kFuncName).as(); CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; std::string name = name_node->value; return runtime::contrib::GetSubgraphID(name); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index a62b85743fc6..7f7c391a1fee 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -447,7 +447,7 @@ class VMFunctionCompiler : ExprFunctor { } void EmitInvokeExternal(const Function& func, - const std::vector& unpacked_arg_regs, + const std::vector& arg_regs, size_t arity, size_t return_count) { CHECK(func->IsExternal()); @@ -457,11 +457,11 @@ class VMFunctionCompiler : ExprFunctor { size_t subgraph_id = context_->external_funcs.size(); context_->external_funcs.push_back(func); // Emit an instruction to invoke the external function/subgraph. - Emit(Instruction::InvokeExternal(subgraph_id, arity, return_count, unpacked_arg_regs)); + Emit(Instruction::InvokeExternal(subgraph_id, arity, return_count, arg_regs)); } void EmitInvokePacked(const Function& func, - const std::vector& unpacked_arg_regs, + const std::vector& arg_regs, size_t arity, size_t return_count) { Target target; @@ -487,7 +487,7 @@ class VMFunctionCompiler : ExprFunctor { op_index = context_->seen_funcs[cfunc->funcs[0]]; } - Emit(Instruction::InvokePacked(op_index, arity, return_count, unpacked_arg_regs)); + Emit(Instruction::InvokePacked(op_index, arity, return_count, arg_regs)); } @@ -526,11 +526,11 @@ class VMFunctionCompiler : ExprFunctor { // Next generate the invoke instruction. 
CHECK(func->IsPrimitive() || func->IsExternal()); if (func->IsExternal()) { - EmitInvokeExternal(op_index, argument_registers.size(), output_tuple->fields.size(), - argument_registers); + EmitInvokeExternal(func, argument_registers, argument_registers.size(), + output_tuple->fields.size()); } else { - EmitInvokePacked(op_index, argument_registers.size(), output_tuple->fields.size(), - argument_registers); + EmitInvokePacked(func, argument_registers, argument_registers.size(), + output_tuple->fields.size()); } } diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index f083b058cafd..86b3d3dc2dfa 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -967,13 +967,12 @@ void VirtualMachine::RunLoop() { const auto& arity = instr.ext_arity; std::vector args; for (Index i = 0; i < arity; ++i) { - args.push_back(ReadRegister(instr.ext_args[i])); + DLOG(INFO) << + "arg" << i << " $" << instr.ext_args[i]; + auto arg = ReadRegister(instr.ext_args[i]); + args.push_back(arg); } InvokePacked(instr.ext_index, func, arity, instr.ext_output_size, args); - for (Index i = 0; i < instr.ext_output_size; ++i) { - WriteRegister(instr.ext_args[instr.ext_arity - instr.ext_output_size + i], - args[instr.ext_arity - instr.ext_output_size + i]); - } pc++; goto main_loop; } From d95023429cb734c447efde8743772ab22f5ba473 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Mon, 25 Nov 2019 05:05:20 +0000 Subject: [PATCH 30/34] return csourcemodule from external codegen --- cmake/modules/contrib/Extern.cmake | 20 +- .../tvm/runtime/contrib/dnnl/dnnl_kernel.h | 57 ++++ include/tvm/runtime/vm.h | 41 +-- python/tvm/module.py | 12 +- src/codegen/codegen.cc | 1 + src/relay/backend/build_module.cc | 26 ++ src/relay/backend/contrib/contrib_codegen.h | 147 +++++++++- src/relay/backend/contrib/dnnl/codegen.cc | 207 ++++++++------ src/relay/backend/contrib/dnnl/libs.cc | 224 --------------- src/relay/backend/contrib/gcc/codegen.cc | 181 ++++++------ src/relay/backend/graph_runtime_codegen.cc | 62 ++++- src/relay/backend/vm/compiler.cc | 130 ++------- src/relay/backend/vm/compiler.h | 9 +- src/relay/pass/partition_graph.cc | 2 +- src/relay/pass/pass_manager.cc | 4 +- src/runtime/contrib/dnnl/dnnl.cc | 259 +++++++++++++++--- src/runtime/contrib/dnnl/dnnl.h | 64 ----- src/runtime/contrib/extern_common.h | 179 ------------ src/runtime/contrib/gcc/gcc.cc | 78 ------ src/runtime/contrib/gcc/gcc.h | 63 ----- src/runtime/module_util.cc | 1 + src/runtime/vm/executable.cc | 20 +- src/runtime/vm/vm.cc | 71 +---- .../python/relay/test_pass_partition_graph.py | 150 +++++++--- tutorials/dev/custom_relay_backend.py | 10 +- 25 files changed, 891 insertions(+), 1127 deletions(-) create mode 100644 include/tvm/runtime/contrib/dnnl/dnnl_kernel.h delete mode 100644 src/relay/backend/contrib/dnnl/libs.cc delete mode 100644 src/runtime/contrib/dnnl/dnnl.h delete mode 100644 src/runtime/contrib/extern_common.h delete mode 100644 src/runtime/contrib/gcc/gcc.cc delete mode 100644 src/runtime/contrib/gcc/gcc.h diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index f44ff1c9fbf4..2c55779b0a80 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -17,8 +17,8 @@ message(STATUS "Build with relay.backend.contrib") -list(FIND USE_EXTERN "gcc" _gcc_idx) -if(_gcc_idx GREATER -1) +list(FIND USE_EXTERN "gcc" GCC_IDX) +if(GCC_IDX GREATER -1) file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) @@ -27,13 +27,15 @@ 
if(_gcc_idx GREATER -1) message(STATUS "Use extern library: GCC") endif() -list(FIND USE_EXTERN "dnnl" _dnnl_idx) -if(_dnnl_idx GREATER -1) - file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) - list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) +list(FIND USE_EXTERN "dnnl" DNNL_IDX) +if(DNNL_IDX GREATER -1) + file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) + list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) - file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*) - list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) - message(STATUS "Use extern library: MKLDNN") + find_library(EXTERN_LIBRARY_DNNL dnnl) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_DNNL}) + file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*) + list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) + message(STATUS "Use extern library: MKLDNN" ${EXTERN_LIBRARY_DNNL}) endif() diff --git a/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h b/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h new file mode 100644 index 000000000000..be9afc2c4011 --- /dev/null +++ b/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/dnnl/dnnl_kernel.h + * \brief Use external dnnl library kernels. + */ + +#ifndef TVM_RUNTIME_CONTRIB_DNNL_DNNL_KERNEL_H_ +#define TVM_RUNTIME_CONTRIB_DNNL_DNNL_KERNEL_H_ + +#include "dnnl.hpp" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace dnnl; + +extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, + int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, + int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, + int p_Sh_, int p_Sw_); + +extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, + int p_I_, int p_O_); + +extern "C" void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, + int p_W_); + +extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, + float* variance, float* out, int p_n_, int p_c_, + int p_h_, int p_w_, int p_e_); + +extern "C" void dnnl_add(float* data, float* weight, float* out, int p_n_, + int p_c_, int p_h_, int p_w_); + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_DNNL_DNNL_KERNEL_H_ diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index 8bf2c3553471..f7188e4b7896 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_VM_H_ #define TVM_RUNTIME_VM_H_ -#include #include #include #include @@ -140,7 +139,6 @@ enum class Opcode { LoadConsti = 14U, Fatal = 15U, AllocStorage = 16U, - InvokeExternal = 17U, }; /*! \brief A single virtual machine instruction. 
@@ -204,16 +202,6 @@ struct Instruction { /*! \brief The arguments to pass to the packed function. */ RegName* packed_args; }; - struct /* InvokeExternal Operands */ { - /*! \brief The index into the external function table. */ - Index ext_index; - /*! \brief The arity of the external function. */ - Index ext_arity; - /*! \brief The number of outputs produced by the external function. */ - Index ext_output_size; - /*! \brief The arguments to pass to the external function. */ - RegName* ext_args; - }; struct /* If Operands */ { /*! \brief The register containing the test value. */ RegName test; @@ -301,7 +289,7 @@ struct Instruction { */ static Instruction InvokePacked(Index packed_index, Index arity, Index output_size, const std::vector& args); - /*! + /*! * \brief Construct an allocate tensor instruction with constant shape. * \param storage The storage to allocate out of. * \param shape The shape of the tensor. @@ -311,16 +299,6 @@ struct Instruction { */ static Instruction AllocTensor(RegName storage, const std::vector& shape, DLDataType dtype, RegName dst); - /*! - * \brief Construct an invoke external instruction. - * \param packed_index The index of the external function. - * \param ext_arity The arity of the function. - * \param ext_output_size The number of outputs of the external function. - * \param args The argument registers. - * \return The invoke external instruction. - */ - static Instruction InvokeExternal(Index external_index, Index ext_arity, Index ext_output_size, - const std::vector& args); /*! * \brief Construct an allocate tensor instruction with register. * \param storage The storage to allocate out of. @@ -611,13 +589,9 @@ class Executable : public ModuleNode { return "VMExecutable"; } - /*! - * \brief The runtime module/library that contains both the host and also the device - * code when executing on non-CPU devices. - */ + /*! \brief The runtime module/library that contains both the host and also the device + * code when executing on non-CPU devices. */ runtime::Module lib; - /*! \brief The external module/library. */ - std::vector ext_libs; /*! \brief The global constant pool. */ std::vector constants; /*! \brief A map from globals (as strings) to their index in the function map. */ @@ -628,13 +602,6 @@ class Executable : public ModuleNode { std::unordered_map primitive_map; /*! \brief The virtual machine's function table. */ std::vector functions; - /*! \brief A mapping from the subgraph id to the external library index in the - * `ext_libs`. - */ - std::unordered_map external_map; - /*! \brief A mapping from the subgraph id to the external function name. - */ - std::unordered_map external_func_map; private: /*! @@ -747,8 +714,6 @@ class VirtualMachine : public runtime::ModuleNode { protected: /*! \brief The virtual machine's packed function table. */ std::vector packed_funcs_; - /*! \brief The virtual machine's external function table. */ - std::vector external_funcs; /*! \brief The current stack of call frames. */ std::vector frames_; /*! \brief The fuction table index of the current function. 
*/ diff --git a/python/tvm/module.py b/python/tvm/module.py index 2790227f32c7..fcf74828f003 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -133,7 +133,17 @@ def export_library(self, self.save(path_obj) files = [path_obj] is_system_lib = self.type_key == "llvm" and self.get_function("__tvm_is_system_module")() + has_imported_c_file = False if self.imported_modules: + for i, m in enumerate(self.imported_modules): + if m.type_key == "c": + has_imported_c_file = True + c_file_name = "tmp_" + str(i) + ".cc" + path_cc = temp.relpath(c_file_name) + with open(path_cc, "w") as f: + f.write(m.get_source()) + print(m.get_source()) + files.append(path_cc) path_cc = temp.relpath("devc.cc") with open(path_cc, "w") as f: f.write(_PackImportsToC(self, is_system_lib)) @@ -143,7 +153,7 @@ def export_library(self, fcompile = _tar.tar else: fcompile = _cc.create_shared - if self.type_key == "c": + if self.type_key == "c" or has_imported_c_file: options = [] if "options" in kwargs: opts = kwargs["options"] diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc index 8464e3dbbb2a..921b5a0bca5c 100644 --- a/src/codegen/codegen.cc +++ b/src/codegen/codegen.cc @@ -69,6 +69,7 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) { << "Only support simply one-level hierarchy"; std::string tkey = im->type_key(); stream->Write(tkey); + if (tkey == "c") continue; im->SaveToBinary(stream); } // translate to C program diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 9254c7e3e7b9..6d0fe581f9d2 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -73,6 +73,14 @@ struct GraphCodegen { return CallFunc("get_graph_json", nullptr); } + Array GetExternalFuncs() { + return CallFunc >("get_external_funcs", nullptr); + } + + runtime::Module GetExternalModule() { + return CallFunc("get_external_module", nullptr); + } + Map > GetLoweredFunc() { return CallFunc > >("get_lowered_funcs", nullptr); } @@ -148,6 +156,14 @@ class RelayBuildModule : public runtime::ModuleNode { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->graph_codegen_->GetLoweredFunc(); }); + } else if (name == "get_external_funcs") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->graph_codegen_->GetExternalFuncs(); + }); + } else if (name == "get_external_module") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->graph_codegen_->GetExternalModule(); + }); } else if (name == "optimize") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { CHECK_EQ(args.num_args, 2); @@ -474,6 +490,16 @@ class RelayBuildModule : public runtime::ModuleNode { target_host_, BuildConfig::Current()); } + Array external_funcs = graph_codegen_->GetExternalFuncs(); + if (!external_funcs.empty()) { + auto ext_rt_mod = graph_codegen_->GetExternalModule(); + // Execute the whole module using external runtime. 
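+      // If TVM itself lowered no functions, the external module becomes the
+      // top-level runtime module; otherwise it is imported so that
+      // export_library() packages the generated C source together with the
+      // DSO code.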
+ if (lowered_funcs.size() == 0) { + ret_.mod = ext_rt_mod; + } else { + ret_.mod.Import(ext_rt_mod); + } + } } protected: diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h index 0a00c2542f33..b38c3dfe0e02 100644 --- a/src/relay/backend/contrib/contrib_codegen.h +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -25,8 +25,9 @@ #define TVM_RELAY_BACKEND_CONTRIB_CONTRIB_CODEGEN_H_ #include +#include #include -#include "../../../runtime/contrib/extern_common.h" +#include namespace tvm { namespace relay { @@ -37,32 +38,158 @@ class ExternCodegenBase { ExternCodegenBase() = default; /*! - * \brief Compile the external library. - */ - virtual void CompileExternLib() = 0; - - /*! - * \brief Build the shared library of external ops. + * \brief Create a runtime module for the external library. For example, it + * could be a CSourceModule that can be directly compiled and linked together + * with a DSOModule, or a json style module that emitts a json artifact that + * is able to be executed by a customized json runtime. * * \param ref The subgraph Relay expression/module to be executed using extern ops. * + * \return A runtime module. */ - virtual void Build(const NodeRef& ref) = 0; + virtual runtime::Module CreateExternModule(const NodeRef& ref) = 0; /*! * \brief Split the Relay function name to tokens. * * \param func The provided function. + * \param prefix The prefix of the function name, i.e. dnnl. * * \return A vector of tokenized function name splitted by "_". */ - std::string GetSubgraphID(const Function& func) const { + std::string GetSubgraphID(const Function& func, const std::string& prefix) const { const auto name_node = FunctionGetAttr(func, attr::kFuncName).as(); CHECK(name_node != nullptr) << "Fail to retrieve subgraph name."; std::string name = name_node->value; - return runtime::contrib::GetSubgraphID(name); + return GetSubgraphID(name, prefix); } + + /*! + * \brief Split the encoded function name to tokens. + * + * \param the function name string. + * + * \return a vector of tokenized function name splitted by "_". + */ + std::string GetSubgraphID(const std::string& name, const std::string& prefix) const { + std::string temp = name; + std::vector tokens; + std::string delimiter = "_"; + size_t pos = 0; + std::string token; + while ((pos = temp.find(delimiter)) != std::string::npos) { + token = temp.substr(0, pos); + tokens.push_back(token); + temp.erase(0, pos + delimiter.length()); + } + tokens.push_back(temp); + + CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; + CHECK(tokens[0] == prefix) + << "Function name: " << name + << " does not start with: " << prefix; + return tokens[1]; + } +}; + +// A helper class to write the declaration of external functions. +class ExternSourcePrinter { + protected: + /*! \brief Print indents using spaces. */ + void PrintIndents() { + for (int i = 0; i < indent_; i++) { + code_stream_ << ' '; + } + } + + /*! + * \brief Enter a new scope. + */ + void EnterScope() { indent_ += 2; } + + /*! + * \brief Exit a scope. + */ + void ExitScope() { + CHECK_GE(indent_, 2U) << "Wrong ident found."; + indent_ -= 2; + } + + /*! + * \brief Gerenate a wrapper for the subgraph that will use external codegen. + * + * \param func_name The name of wrapper function. + * \param arg_cnt The expected number of arguments for the wrapper. + * + * \code + * + * // An example code for the wrapper. 
+ * extern "C" void foo(TVMValue* value, int* type_code, int nargs) { + * if (nargs != 3) { + * printf("foo expects 3 args, but received %d\n", nargs); + * return 1; + * } + * + * DLTensor* arg0 = static_cast(value[0].v_handle); + * DLTensor* arg1 = static_cast(value[1].v_handle); + * DLTensor* out = static_cast(value[2].v_handle); + * + * foo_(static_cast(arg0->data), + * static_cast(arg1->data), + * static_cast(out->data)); + * return 0; + * } + * + * \endcode + */ + void GenerateSubgraphWrapper(const std::string& func_name, int arg_cnt) { + // Print signature + code_stream_ << "\n"; + code_stream_ << "extern \"C\" int " << func_name; + code_stream_ << "(TVMValue* value, int* type_code, int nargs) {\n"; + EnterScope(); + // Print guard + PrintIndents(); + code_stream_ << "if (nargs != " << arg_cnt << "){\n"; + EnterScope(); + PrintIndents(); + code_stream_ << "printf(\"" << func_name << " expects " << arg_cnt + << "arguments, but received %d\\n\", nargs);\n"; + PrintIndents(); + code_stream_ << "return 1;\n"; + ExitScope(); + PrintIndents(); + code_stream_ << "}\n"; + + // According to TVM's calling convention, the last one is output. + for (int i = 0; i < arg_cnt; i++) { + PrintIndents(); + code_stream_ << "DLTensor* arg" << i << " = " + << "static_cast(value[" << i << "].v_handle);\n"; + } + // Generate the call. + PrintIndents(); + code_stream_ << func_name << "_("; + for (int i = 0; i < arg_cnt - 1; i++) { + code_stream_ << "static_cast(arg" << i << "->data), "; + } + if (arg_cnt > 0) { + code_stream_ << "static_cast(arg" << arg_cnt - 1 << "->data)"; + } + code_stream_ << ");\n\n"; + PrintIndents(); + code_stream_ << "return 0;\n"; + ExitScope(); + code_stream_ << "}"; + } + + /*! \brief The external function source code stream. */ + std::ostringstream code_stream_; + + private: + /*! \brief Indent of the source code. */ + int indent_{0}; }; } // namespace contrib diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 64489bbbddef..bae44cb201e4 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -27,22 +27,20 @@ #include #include #include +#include -#include #include #include -#include -#include "../../../../runtime/contrib/dnnl/dnnl.h" #include "../contrib_codegen.h" namespace tvm { namespace relay { namespace contrib { -// FIXME: This is an experimental implementation. We should implement all utilities -// and make a base class such as ExternBuilder for users to implement. -class DnnlBuilder : public ExprVisitor { +// TODO(@zhiics, @comaniac): This is basic implementation. We should implement +// all utilities and make a base class for users to implement. 
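+//
+// As a rough illustration (argument names and shapes are only an example), a
+// subgraph containing a single nn.relu on a (1, 32, 14, 14) tensor would be
+// emitted roughly as:
+//
+//   extern "C" void dnnl_0_(float* input0, float* out) {
+//     float* buf_0 = (float*)std::malloc(4 * 6272);
+//     dnnl_relu(input0, buf_0, 1, 32, 14, 14);
+//     std::memcpy(out, buf_0, 4 * 6272);
+//     std::free(buf_0);
+//   }
+//
+// together with a dnnl_0(TVMValue*, int*, int nargs) wrapper created by
+// GenerateSubgraphWrapper() that unpacks the DLTensor arguments.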
+class DnnlBuilder : public ExprVisitor, public ExternSourcePrinter { public: explicit DnnlBuilder(const std::string& id) { this->subgraph_id_ = id; } @@ -57,15 +55,13 @@ class DnnlBuilder : public ExprVisitor { } void VisitExpr_(const CallNode* call) final { - // Make function declaration - std::string decl = ""; - + std::ostringstream decl_stream; + std::ostringstream buf_stream; // Args: ID - std::string func_name = ""; std::vector args; if (IsOp(call, "nn.conv2d")) { - func_name = "dnnl_conv2d"; + decl_stream << "dnnl_conv2d"; const auto* conv2d_attr = call->attrs.as(); auto ishape = GetShape(call->args[0]->checked_type()); @@ -86,7 +82,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); } else if (IsOp(call, "nn.dense")) { - func_name = "dnnl_dense"; + decl_stream << "dnnl_dense"; auto ishape = GetShape(call->args[0]->checked_type()); auto wshape = GetShape(call->args[1]->checked_type()); @@ -96,7 +92,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(wshape[0])); } else if (IsOp(call, "nn.relu")) { - func_name = "dnnl_relu"; + decl_stream << "dnnl_relu"; auto ishape = GetShape(call->args[0]->checked_type()); // Args: N, C, H, W @@ -104,7 +100,7 @@ class DnnlBuilder : public ExprVisitor { args.push_back(std::to_string(s)); } } else if (IsOp(call, "nn.batch_norm")) { - func_name = "dnnl_bn"; + decl_stream << "dnnl_bn"; const auto* bn_attr = call->attrs.as(); auto ishape = GetShape(call->args[0]->checked_type()); @@ -116,7 +112,7 @@ class DnnlBuilder : public ExprVisitor { // Args: epilson args.push_back(std::to_string(bn_attr->epsilon)); } else if (IsOp(call, "add")) { - func_name = "dnnl_add"; + decl_stream << "dnnl_add"; auto ishape = GetShape(call->args[0]->checked_type()); // Args: H, W @@ -129,15 +125,15 @@ class DnnlBuilder : public ExprVisitor { // Make function call with input buffers when visiting arguments bool first = true; - std::string func_call = func_name + "("; + decl_stream << "("; for (size_t i = 0; i < call->args.size(); ++i) { VisitExpr(call->args[i]); for (auto out : out_) { if (!first) { - func_call += ", "; + decl_stream << ", "; } first = false; - func_call += out.first; + decl_stream << out.first; } } @@ -151,58 +147,90 @@ class DnnlBuilder : public ExprVisitor { for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } - std::string buf_decl = - "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; - buf_decl_.push_back(buf_decl); - func_call += ", " + out; + this->PrintIndents(); + buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");"; + buf_decl_.push_back(buf_stream.str()); + decl_stream << ", " << out; // Attach attribute arguments for (size_t i = 0; i < args.size(); ++i) { - func_call += ", " + args[i]; + decl_stream << ", " << args[i]; } - func_call += ");"; - subgraph_body.push_back(func_call); + decl_stream << ");"; + subgraph_body.push_back(decl_stream.str()); // Update output buffer out_.clear(); out_.push_back({out, out_size}); } - std::string build() { - std::string code = ""; - - // Write subgraph function declaration - code += "extern \"C\" void " + subgraph_id_ + "(DnnlPackedArgs args, float* out) {\n"; + std::string jit_dnnl() { + // Create the signature. 
For example, it could be: + // extern "C" void dnnl_0_(float* input0, float* input1, float* out, int M, int N) {} + code_stream_ << "extern \"C\" void " << subgraph_id_ << "_("; - // Unpack inputs - for (size_t i = 0; i < subgraph_args_.size(); ++i) { - code += - " float* " + subgraph_args_[i] + " = (float*) args.data[" + std::to_string(i) + "];\n"; + for (const auto& arg : subgraph_args_) { + code_stream_ << "float* " << arg << ", "; } + code_stream_ << "float* out) {\n"; + this->EnterScope(); + // Function body for (auto decl : buf_decl_) { - code += " " + decl + "\n"; + this->PrintIndents(); + code_stream_ << decl << "\n"; } + code_stream_ << "\n"; for (auto stmt : subgraph_body) { - code += " " + stmt + "\n"; + this->PrintIndents(); + code_stream_ << stmt << "\n"; } // Copy output - CHECK(out_.size() == 1) << "Internal error"; - code += " memcpy(out, " + out_[0].first + ", 4 *" + std::to_string(out_[0].second) + ");\n"; + CHECK_EQ(out_.size(), 1U) << "Internal error: only single output is support yet."; + this->PrintIndents(); + code_stream_ << "std::memcpy(out, " << out_[0].first << ", 4 * " + << out_[0].second << ");\n"; + + // Free buffers + for (size_t i = 0; i < buf_decl_.size(); i++) { + this->PrintIndents(); + code_stream_ << "std::free(buf_" << i << ");\n"; + } - code += "}\n"; - return code; + this->ExitScope(); + code_stream_ << "}\n"; + + // Create the wrapper to call the subgraph + this->GenerateSubgraphWrapper(subgraph_id_, + subgraph_args_.size() + 1 /* output */); + return code_stream_.str(); } private: - std::string subgraph_id_ = ""; - int buf_idx_ = 0; + /*! \brief The id of the external dnnl subgraph. */ + std::string subgraph_id_{""}; + /*! + * \brief The index to track the output buffer. Each kernel will redirect the + * output to a buffer that may be consumed by other kernels. + */ + int buf_idx_{0}; + /*! \brief The arguments used by a wrapped external function. */ std::vector subgraph_args_; + /*! \brief statement of the external function. */ std::vector subgraph_body; + /*! \brief The declaration of intermeidate buffers. */ std::vector buf_decl_; + /*! \brief The name of the the outputs. */ std::vector> out_; + /*! + * \brief Extract the shape from a Relay tensor type. + * + * \param type The provided type. + * + * \return The extracted shape in a list. + */ std::vector GetShape(const Type& type) const { const auto* ttype = type.as(); CHECK(ttype); @@ -215,7 +243,16 @@ class DnnlBuilder : public ExprVisitor { return shape; } - bool IsOp(const CallNode* call, std::string op_name) { + /*! + * \brief Check if a call has the provided name. + * + * \param call A Relay call node. + * \param op_name The name of the expected call. + * + * \return true if the call's name is equivalent to the given name. Otherwise, + * false. + */ + bool IsOp(const CallNode* call, std::string op_name) const { const auto* op_node = call->op.as(); CHECK(op_node) << "Expects a single op."; Op op = GetRef(op_node); @@ -223,70 +260,71 @@ class DnnlBuilder : public ExprVisitor { } }; +/*! + * \brief The DNNL codegen helper to generate wrapepr function calls of DNNL + * libraries. The code is a CSourceModule that can be compiled separately and + * linked together with a DSOModule. + */ class DNNLCodegen : public ExternCodegenBase { public: - std::string GetLibPath() const { - return lib_path_; - } - - void CreateExternSignature(const Function& func, bool update) { + // Create a corresponding external function for the given relay Function. 
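+  // GetSubgraphID() expects the partitioned function to be named
+  // "<compiler>_<id>" (e.g. "dnnl_0"); it validates the "dnnl" prefix and
+  // returns the id, which is reused for the emitted "dnnl_<id>_" kernel and
+  // its "dnnl_<id>" wrapper symbol.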
+ void CreateExternFunction(const Function& func) { CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; const auto* call = func->body.as(); CHECK(call) << "DNNL expects a single convolution or dense op"; // Record subgraph ID for runtime invoke. - auto sid = GetSubgraphID(func); - - if (update) { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr; - std::stringstream ss; - ss << std::hex << distr(gen); - std::ifstream lib_file("src/relay/backend/contrib/dnnl/libs.cc"); - code_.assign((std::istreambuf_iterator(lib_file)), - std::istreambuf_iterator()); - lib_path_ = "/tmp/relay_dnnl_lib_" + ss.str() + ".so"; - } + auto sid = GetSubgraphID(func, "dnnl"); - auto builder = DnnlBuilder(runtime::contrib::kDnnlPrefix + sid); + auto builder = DnnlBuilder("dnnl_" + sid); builder.VisitExpr(func->body); - code_ += builder.build(); + code_stream_ << builder.jit_dnnl(); } - void CompileExternLib() override { - std::string code = "echo \'" + code_ + "\'"; - std::string cmd = "g++ -O2 -Wall -std=c++11 -shared -fPIC -xc++ - -o " + lib_path_ + - " -ldl -lpthread -lm -ldnnl"; - cmd = code + " | " + cmd; - int ret = std::system(cmd.c_str()); - if (ret < 0) { - LOG(FATAL) << "Failed to compile DNNL library. Error code: " << ret; - } - } + /*! + * \brief The overridden function that will create a CSourceModule. In order + * to compile the generated C source code, users need to specify the paths to + * some libraries, including some TVM required and dnnl specific ones. To make + * linking simpiler, the DNNL kernels are wrapped in a TVM compatible manner + * and are live under include/tvm/runtime/contrib/dnnl folder. + * + * \param ref A object ref that could be either a Relay function or module. + * + * \return The runtime module that contains C source code. + */ + runtime::Module CreateExternModule(const NodeRef& ref) { + // Create headers + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "using namespace tvm::runtime::contrib;\n"; + code_stream_ << "\n"; - void Build(const NodeRef& ref) override { if (ref->IsInstance()) { - CreateExternSignature(Downcast(ref), true); - CompileExternLib(); + CreateExternFunction(Downcast(ref)); } else if (ref->IsInstance()) { relay::Module mod = Downcast(ref); - bool update = true; for (const auto& it : mod->functions) { - CreateExternSignature(Downcast(it.second), update); - update = false; + CreateExternFunction(Downcast(it.second)); } - CompileExternLib(); } else { LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; } + + // Create a CSourceModule + const auto* pf = runtime::Registry::Get("module.csource_module_create"); + CHECK(pf != nullptr) << "Cannot find csource module to create the external function"; + return (*pf)(code_stream_.str(), "cc"); } private: - std::string code_; - std::string lib_path_; + /*! \brief The code stream that prints the external functions. */ + std::ostringstream code_stream_; }; /*! 
@@ -295,10 +333,7 @@ class DNNLCodegen : public ExternCodegenBase { */ runtime::Module DNNLCompiler(const NodeRef& ref) { DNNLCodegen dnnl; - dnnl.Build(ref); - std::shared_ptr n = - std::make_shared(dnnl.GetLibPath()); - return runtime::Module(n); + return dnnl.CreateExternModule(ref); } TVM_REGISTER_API("relay.ext.dnnl") diff --git a/src/relay/backend/contrib/dnnl/libs.cc b/src/relay/backend/contrib/dnnl/libs.cc deleted file mode 100644 index b97b36099ef6..000000000000 --- a/src/relay/backend/contrib/dnnl/libs.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "dnnl.hpp" - -using namespace dnnl; - -typedef struct { - void** data; -} DnnlPackedArgs; - -// Read from memory, write to handle -inline void read_from_dnnl_memory(void* handle, const memory& mem) { - size_t bytes = mem.get_desc().get_size(); - - uint8_t* src = static_cast(mem.get_data_handle()); - std::copy(src, src + bytes, reinterpret_cast(handle)); -} - -extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, - int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, - int p_Kw_, int p_Sh_, int p_Sw_) { - using tag = memory::format_tag; - using dt = memory::data_type; - engine eng(engine::kind::cpu, 0); - stream s(eng); - - memory::dims conv2d_src_tz = {p_N_, p_C_, p_H_, p_W_}; - memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; - if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; - memory::dims conv2d_bias_tz = {p_O_}; - memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, - (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; - memory::dims conv2d_strides = {p_Sh_, p_Sw_}; - memory::dims conv2d_padding = {p_Ph_, p_Pw_}; - - std::vector conv2d_bias(p_O_, 0); - - auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); - auto user_weights_memory = - memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? 
tag::goihw : tag::oihw}, eng, weights); - auto conv2d_user_bias_memory = - memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); - - auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); - auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); - auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); - auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); - - auto conv2d_desc = convolution_forward::desc( - prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md, - conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding, conv2d_padding); - auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); - - auto conv2d_src_memory = user_src_memory; - auto conv2d_weights_memory = user_weights_memory; - auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); - - auto conv = convolution_forward(conv2d_prim_desc); - conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, - {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, - {DNNL_ARG_BIAS, conv2d_user_bias_memory}, - {DNNL_ARG_DST, conv2d_dst_memory}}); - s.wait(); - read_from_dnnl_memory(out, conv2d_dst_memory); -} - -extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_) { - using tag = memory::format_tag; - using dt = memory::data_type; - - engine eng(engine::kind::cpu, 0); - stream s(eng); - - memory::dims data_tz = {p_B_, p_I_}; - memory::dims weight_tz = {p_O_, p_I_}; - memory::dims bias_tz = {p_O_}; - memory::dims dst_tz = {p_B_, p_O_}; - - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nc}; - auto weight_md = memory::desc({{weight_tz}, dt::f32, tag::nc}); - auto bias_md = memory::desc({{bias_tz}, dt::f32, tag::x}); - auto dst_md = memory::desc({{dst_tz}, dt::f32, tag::nc}); - - std::vector bias(p_O_, 0); - auto data_memory = memory(data_md, eng, data); - auto weight_memory = memory(weight_md, eng, weight); - auto bias_memory = memory(bias_md, eng, bias.data()); - auto dst_memory = memory(dst_md, eng); - - auto dense_desc = inner_product_forward::desc(prop_kind::forward_inference, data_md, weight_md, - bias_md, dst_md); - auto dense_prim_desc = inner_product_forward::primitive_desc(dense_desc, eng); - assert(dst_md == dense_prim_desc.dst_desc()); - - auto dense = inner_product_forward(dense_prim_desc); - dense.execute(s, {{DNNL_ARG_SRC, data_memory}, - {DNNL_ARG_WEIGHTS, weight_memory}, - {DNNL_ARG_BIAS, bias_memory}, - {DNNL_ARG_DST, dst_memory}}); - s.wait(); - read_from_dnnl_memory(out, dst_memory); -} - -extern "C" void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, int p_W_) { - using tag = memory::format_tag; - using dt = memory::data_type; - - engine eng(engine::kind::cpu, 0); - stream s(eng); - - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; - - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; - - auto data_memory = memory(data_md, eng, data); - auto dst_memory = memory(data_md, eng); - - auto relu_desc = - eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu, data_md, 0); - auto relu_prim_desc = eltwise_forward::primitive_desc(relu_desc, eng); - assert(data_md == relu_prim_desc.dst_desc()); - - auto relu = eltwise_forward(relu_prim_desc); - relu.execute(s, {{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, dst_memory}}); - s.wait(); - read_from_dnnl_memory(out, dst_memory); -} - -extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, float* variance, - float* out, 
int p_N_, int p_C_, int p_H_, int p_W_, int p_E_) { - using tag = memory::format_tag; - using dt = memory::data_type; - - engine eng(engine::kind::cpu, 0); - stream s(eng); - - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; - - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; - - auto data_memory = memory(data_md, eng, data); - auto dst_memory = memory(data_md, eng); - - auto bn_desc = batch_normalization_forward::desc( - prop_kind::forward_inference, data_md, p_E_, - normalization_flags::use_global_stats | normalization_flags::use_scale_shift); - auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); - assert(data_md == bn_prim_desc.dst_desc()); - - float* weight = reinterpret_cast(malloc(sizeof(float) * 2 * p_C_)); - memcpy(weight, gamma, sizeof(float) * p_C_); - memcpy(weight + p_C_, beta, sizeof(float) * p_C_); - - auto weight_memory = memory(bn_prim_desc.weights_desc(), eng, weight); - auto mean_memory = memory(bn_prim_desc.mean_desc(), eng, mean); - auto variance_memory = memory(bn_prim_desc.variance_desc(), eng, variance); - - auto bn = batch_normalization_forward(bn_prim_desc); - bn.execute(s, {{DNNL_ARG_SRC, data_memory}, - {DNNL_ARG_DST, dst_memory}, - {DNNL_ARG_SCALE_SHIFT, weight_memory}, - {DNNL_ARG_MEAN, mean_memory}, - {DNNL_ARG_VARIANCE, variance_memory}}); - s.wait(); - read_from_dnnl_memory(out, dst_memory); - free(weight); -} - -extern "C" void dnnl_add(float* data, float* weight, float* out, - int p_N_, int p_C_, int p_H_, int p_W_) { - using tag = memory::format_tag; - using dt = memory::data_type; - - engine eng(engine::kind::cpu, 0); - stream s(eng); - - memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; - - auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; - auto weight_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); - auto dst_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); - - auto data_memory = memory(data_md, eng, data); - auto weight_memory = memory(weight_md, eng, weight); - auto dst_memory = memory(dst_md, eng); - - auto add_desc = binary::desc(algorithm::binary_add, data_md, weight_md, dst_md); - auto add_prim_desc = binary::primitive_desc(add_desc, eng); - assert(dst_md == add_prim_desc.dst_desc()); - - auto add = binary(add_prim_desc); - add.execute( - s, - {{DNNL_ARG_SRC_0, data_memory}, {DNNL_ARG_SRC_1, weight_memory}, {DNNL_ARG_DST, dst_memory}}); - s.wait(); - read_from_dnnl_memory(out, dst_memory); -} diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 7317b7017a6d..36005ee540ad 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -20,22 +20,23 @@ #include #include #include +#include -#include #include #include -#include #include "../contrib_codegen.h" -#include "../../../../runtime/contrib/gcc/gcc.h" namespace tvm { namespace relay { namespace contrib { -// FIXME: This is an experimental implementation. We should implement all utilities -// and make a base claaa such as ExternBuilder for users to implement. -class GccBuilder : public ExprVisitor { +/*! + * \brief An example codegen that is only used for quick prototyping and testing + * purpose. Only several binary options are covered in the GCC builder. Users + * may need to extend them to cover more operators. 
+ */ +class GccBuilder : public ExprVisitor, public ExternSourcePrinter { public: explicit GccBuilder(const std::string& id) { this->subgraph_id_ = id; } @@ -46,41 +47,44 @@ class GccBuilder : public ExprVisitor { } void VisitExpr_(const CallNode* call) final { + std::ostringstream macro_stream; + std::ostringstream decl_stream; + std::ostringstream buf_stream; + auto op_node = call->op.as(); std::string func_name = subgraph_id_ + "_" + std::to_string(func_idx++); // Make function declaration - std::string decl = "GCC_BINARY_OP_" + std::to_string(call->args.size()) + - "D(" + func_name + ", "; + macro_stream << "GCC_BINARY_OP_" << call->args.size() << "D(" << func_name << ", "; if (GetRef(op_node) == Op::Get("add")) { - decl += "+"; + macro_stream << "+"; } else if (GetRef(op_node) == Op::Get("subtract")) { - decl += "-"; + macro_stream << "-"; } else if (GetRef(op_node) == Op::Get("multiply")) { - decl += "*"; + macro_stream << "*"; } else { LOG(FATAL) << "Unrecognized op"; } auto in_shape = GetShape(call->args[0]->checked_type()); for (size_t i = 0; i < in_shape.size(); ++i) { - decl += ", " + std::to_string(in_shape[i]); + macro_stream << ", " << in_shape[i]; } - decl += ");"; - func_decl_.push_back(decl); + macro_stream << ");"; + func_decl_.push_back(macro_stream.str()); // Make function call when visiting arguments bool first = true; - std::string gcc_call = func_name + "("; + decl_stream << func_name << "("; for (size_t i = 0; i < call->args.size(); ++i) { VisitExpr(call->args[i]); for (auto out : out_) { if (!first) { - gcc_call += ", "; + decl_stream << ", "; } first = false; - gcc_call += out.first; + decl_stream << out.first; } } @@ -93,47 +97,62 @@ class GccBuilder : public ExprVisitor { for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } - std::string buf_decl = - "float* " + out + " = (float*)malloc(4 * " + std::to_string(out_size) + ");"; - buf_decl_.push_back(buf_decl); + buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");"; + buf_decl_.push_back(buf_stream.str()); - gcc_call += ", " + out + ");"; - subgraph_body.push_back(gcc_call); + decl_stream << ", " << out << ");"; + subgraph_body.push_back(decl_stream.str()); // Update output buffer out_.clear(); out_.push_back({out, out_size}); } - std::string build() { - std::string code = ""; - + std::string jit_csource() { // Write function macros for (auto decl : func_decl_) { - code += decl + "\n"; + code_stream_ << decl << "\n"; } // Write subgraph function declaration - code += "extern \"C\" void " + subgraph_id_ + "(GccPackedArgs args, float* out) {\n"; + code_stream_ << "extern \"C\" void " << subgraph_id_ << "_("; - // Unpack inputs - for (size_t i = 0; i < subgraph_args_.size(); ++i) { - code += " float* " + subgraph_args_[i] + " = args.data[" + std::to_string(i) + "];\n"; + for (const auto& arg : subgraph_args_) { + code_stream_ << "float* " << arg << ", "; } + + code_stream_ << "float* out) {\n"; + this->EnterScope(); + // Function body for (auto decl : buf_decl_) { - code += " " + decl + "\n"; + this->PrintIndents(); + code_stream_ << decl << "\n"; } + code_stream_ << "\n"; for (auto stmt : subgraph_body) { - code += " " + stmt + "\n"; + this->PrintIndents(); + code_stream_ << stmt << "\n"; } // Copy output CHECK(out_.size() == 1) << "Internal error"; - code += " memcpy(out, " + out_[0].first + ", 4 *" + std::to_string(out_[0].second) + ");\n"; + this->PrintIndents(); + code_stream_ << "std::memcpy(out, " << out_[0].first << ", 4 * " << out_[0].second << ");\n"; + 
+ // Free buffers + for (size_t i = 0; i < buf_decl_.size(); i++) { + this->PrintIndents(); + code_stream_ << "std::free(buf_" << i << ");\n"; + } + + this->ExitScope(); + code_stream_ << "}\n"; - code += "}\n"; - return code; + // Create the wrapper to call the subgraph + this->GenerateSubgraphWrapper(subgraph_id_, + subgraph_args_.size() + 1 /* output */); + return code_stream_.str(); } private: @@ -161,72 +180,71 @@ class GccBuilder : public ExprVisitor { class GccCodegen : public ExternCodegenBase { public: - std::string GetLibPath() const { - return lib_path_; - } - - void CreateExternSignature(const Function& func, bool update) { + void CreateExternFunction(const Function& func) { CHECK(func.defined()) << "Input error: external codegen expects a Relay function."; - const auto* call = func->body.as(); - CHECK(call) << "Unknown error"; // comaniac: Don't know in what case this will fail. // Record subgraph ID for runtime invoke. - auto sid = GetSubgraphID(func); - - // Prepare library source - if (update) { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr; - std::stringstream ss; - ss << std::hex << distr(gen); - std::ifstream lib_file("src/relay/backend/contrib/gcc/libs.cc"); - code_.assign((std::istreambuf_iterator(lib_file)), - std::istreambuf_iterator()); - lib_path_ = "/tmp/relay_gcc_lib_" + ss.str() + ".so"; - } + auto sid = GetSubgraphID(func, "gcc"); - auto builder = GccBuilder(runtime::contrib::kGccPrefix + sid); + auto builder = GccBuilder("gcc_" + sid); builder.VisitExpr(func->body); - std::string code = builder.build(); - - // Append the signature. - code_ = code_ + code; + code_stream_ << builder.jit_csource(); } - void CompileExternLib() override { - // Compile from pipe and generate the library. - std::string code = "echo \'" + code_ + "\'"; - std::string cmd = "g++ -std=c++11 -shared -fPIC -ldl -o " + lib_path_ + " -xc++ -"; - cmd = code + " | " + cmd; + runtime::Module CreateExternModule(const NodeRef& ref) { + // Create headers + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + + // Append some common macro for operator definition. + const char* operator_macro = R"op_marco( + #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + out[i] = a[i] p_OP_ b[i]; \ + } \ + } + + #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ + extern "C" void p_ID_(float* a, float* b, float* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + std::cout << a[k] << " " << b[k] << out[k] << std::endl; \ + } \ + } \ + } + )op_marco"; - int ret = std::system(cmd.c_str()); - if (ret != 0) { - LOG(FATAL) << "Failed to compile GCC library. 
Error code: " << ret; - } - } + code_stream_ << operator_macro << "\n\n"; - void Build(const NodeRef& ref) override { if (ref->IsInstance()) { - CreateExternSignature(Downcast(ref), true); + CreateExternFunction(Downcast(ref)); } else if (ref->IsInstance()) { relay::Module mod = Downcast(ref); - bool update = true; for (const auto& it : mod->functions) { - CreateExternSignature(Downcast(it.second), update); - update = false; + CreateExternFunction(Downcast(it.second)); } } else { LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; } - CompileExternLib(); + LOG(INFO) << code_stream_.str(); + // Create a CSourceModule + const auto* pf = runtime::Registry::Get("module.csource_module_create"); + CHECK(pf != nullptr) << "Cannot find csource module to create the external function"; + return (*pf)(code_stream_.str(), "cc"); } private: - std::string code_; - std::string lib_path_; + std::ostringstream code_stream_; }; /*! @@ -239,10 +257,7 @@ class GccCodegen : public ExternCodegenBase { */ runtime::Module GccCompiler(const NodeRef& ref) { GccCodegen gcc; - gcc.Build(ref); - std::shared_ptr n = - std::make_shared(gcc.GetLibPath()); - return runtime::Module(n); + return gcc.CreateExternModule(ref); } TVM_REGISTER_API("relay.ext.gcc") diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index e2881785766c..cf5f26fedfa7 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -55,6 +56,7 @@ using TargetsMap = std::unordered_map; struct LoweredOutput { std::string graph_json; Map > lowered_funcs; + Array external_funcs; std::unordered_map params; }; @@ -212,6 +214,7 @@ class GraphRuntimeCodegen LoweredOutput ret; ret.graph_json = os.str(); ret.params = params_; + ret.external_funcs = external_funcs_; for (auto& kv : lowered_funcs_) { if (ret.lowered_funcs.count(kv.first) == 0) { ret.lowered_funcs.Set(kv.first, Array()); @@ -380,6 +383,28 @@ class GraphRuntimeCodegen } return fields; } + + std::vector InvokeExternalCodegen(const CallNode* op, const Function& func) { + CHECK(func->IsExternal()); + std::vector inputs; + for (auto arg : op->args) { + auto res = VisitExpr(arg); + for (auto nr : res) { + inputs.push_back(nr); + } + } + external_funcs_.push_back(func); + const auto name_node = FunctionGetAttr(func, attr::kFuncName).as(); + CHECK(name_node != nullptr) << "External function has not been attached a name yet."; + std::string op_name = name_node->value; + auto node = GraphOpNode::make_node_ptr(_GetUniqueName(op_name), + GraphAttrs(), + op_name, + inputs, + GraphAttrs()); + return AddNode(node, GetRef(op)); + } + std::vector VisitExpr_(const CallNode* op) override { Expr expr = GetRef(op); Function func; @@ -390,6 +415,9 @@ class GraphRuntimeCodegen LOG(FATAL) << "Not implemented"; } else if (op->op.as()) { func = GetRef(op->op.as()); + if (func->IsExternal()) { + return InvokeExternalCodegen(op, func); + } } else { LOG(FATAL) << "TVM runtime does not support calls to " << op->op->GetTypeKey(); } @@ -470,7 +498,7 @@ class GraphRuntimeCodegen return {}; } std::vector VisitExpr_(const FunctionNode* op) override { - throw std::invalid_argument("function not supported"); + CHECK(op->IsExternal()) << "Only external function is supported"; return {}; } std::vector VisitExpr_(const RefCreateNode* op) override { @@ -587,6 +615,8 @@ class GraphRuntimeCodegen std::unordered_map name_map_; /*! 
\brief compile engine */ CompileEngine compile_engine_; + /*! \brief external functions */ + Array external_funcs_; }; class GraphRuntimeCodegenModule : public runtime::ModuleNode { @@ -628,7 +658,6 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { } *rv = ret; }); - } else if (name == "get_param_by_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { std::string key = args[0]; @@ -639,6 +668,35 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.lowered_funcs; }); + } else if (name == "get_external_funcs") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.external_funcs; + }); + } else if (name == "get_external_module") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK(!this->output_.external_funcs.empty()) << "No external function is annotated."; + // Invoke the external codegen to generate a external runtime module. + auto compiler = FunctionGetAttr(output_.external_funcs[0], attr::kExternal); + const tvm::ir::StringImm* code_gen = compiler.as(); + CHECK(code_gen) << "No external codegen is set"; + std::string ext_name = "relay.ext." + code_gen->value; + auto pf = tvm::runtime::Registry::Get(ext_name); + CHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; + + // Invoke the 3rd party codegen to generate a library for the external + // functions. + relay::Module rly_mod = relay::ModuleNode::make({}, {}); + for (const auto& func : output_.external_funcs) { + auto ext_func_name = FunctionGetAttr(func, attr::kFuncName); + const tvm::ir::StringImm* func_name = ext_func_name.as(); + CHECK(func_name) << "No external function name is set for:\n" << AsText(func, false); + auto gv = GlobalVarNode::make(func_name->value); + rly_mod->Add(gv, func); + } + runtime::Module ext_mod = (*pf)(rly_mod); + CHECK(ext_mod.defined()) << "No external runtime is generated."; + *rv = ext_mod; + }); } else { return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); } diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 7f7c391a1fee..c38ca1ae0469 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -296,7 +295,6 @@ class VMFunctionCompiler : ExprFunctor { last_register_ = instr.dst; break; case Opcode::InvokePacked: - case Opcode::InvokeExternal: case Opcode::If: case Opcode::Ret: case Opcode::Goto: @@ -446,51 +444,6 @@ class VMFunctionCompiler : ExprFunctor { argument_registers)); } - void EmitInvokeExternal(const Function& func, - const std::vector& arg_regs, - size_t arity, - size_t return_count) { - CHECK(func->IsExternal()); - // Append all subgraphs to a list, and then perform codegen for each - // category (i.e. the ones that use the same codegen should be compiled - // together.) - size_t subgraph_id = context_->external_funcs.size(); - context_->external_funcs.push_back(func); - // Emit an instruction to invoke the external function/subgraph. - Emit(Instruction::InvokeExternal(subgraph_id, arity, return_count, arg_regs)); - } - - void EmitInvokePacked(const Function& func, - const std::vector& arg_regs, - size_t arity, - size_t return_count) { - Target target; - if (targets_.size() == 1) { - // homogeneous execution. - for (auto kv : targets_) { - target = kv.second; - } - } else { - // heterogeneous execution. 
- LOG(FATAL) << "Currently VM compiler doesn't support heterogeneous compilation"; - } - auto key = CCacheKeyNode::make(func, target); - auto cfunc = engine_->Lower(key); - // TODO(jroesch): support lowered funcs for multiple targets - CHECK_EQ(cfunc->funcs.size(), 1); - auto op_index = -1; - if (context_->seen_funcs.find(cfunc->funcs[0]) == context_->seen_funcs.end()) { - op_index = context_->cached_funcs.size(); - context_->cached_funcs.push_back(cfunc); - context_->seen_funcs[cfunc->funcs[0]] = op_index; - } else { - op_index = context_->seen_funcs[cfunc->funcs[0]]; - } - - Emit(Instruction::InvokePacked(op_index, arity, return_count, arg_regs)); - } - - void EmitInvokeTVMOp(const Function& func, const Expr& inputs, const Expr& outputs) { @@ -524,14 +477,35 @@ class VMFunctionCompiler : ExprFunctor { } // Next generate the invoke instruction. - CHECK(func->IsPrimitive() || func->IsExternal()); - if (func->IsExternal()) { - EmitInvokeExternal(func, argument_registers, argument_registers.size(), - output_tuple->fields.size()); + Target target; + if (targets_.size() == 1) { + // homogeneous execution. + for (auto kv : targets_) { + target = kv.second; + } + } else { + // heterogeneous execution. + LOG(FATAL) << "Currently VM compiler doesn't support heterogeneous compilation"; + } + + auto key = CCacheKeyNode::make(func, target); + auto cfunc = engine_->Lower(key); + + // TODO(jroesch): support lowered funcs for multiple targets + CHECK_EQ(cfunc->funcs.size(), 1); + auto op_index = -1; + if (context_->seen_funcs.find(cfunc->funcs[0]) == context_->seen_funcs.end()) { + op_index = context_->cached_funcs.size(); + context_->cached_funcs.push_back(cfunc); + context_->seen_funcs[cfunc->funcs[0]] = op_index; } else { - EmitInvokePacked(func, argument_registers, argument_registers.size(), - output_tuple->fields.size()); + op_index = context_->seen_funcs[cfunc->funcs[0]]; } + + Emit(Instruction::InvokePacked(op_index, + argument_registers.size(), + output_tuple->fields.size(), + argument_registers)); } void VisitExpr_(const CallNode* call_node) { @@ -665,7 +639,7 @@ class VMFunctionCompiler : ExprFunctor { } void VisitExpr_(const FunctionNode* func_node) { - if (!func_node->IsPrimitive() && !func_node->IsExternal()) { + if (!func_node->IsPrimitive()) { LOG(FATAL) << "local functions should have been removed by lambda lifting:" << std::endl << "Program: " << AsText(GetRef(func_node), false) << std::endl << "AST: " << GetRef(func_node); @@ -881,13 +855,11 @@ void VMCompiler::Compile(Module mod, exec_->constants.push_back(vm::Tensor(data)); } - PrimitiveFuncCodegen(); + LibraryCodegen(); for (auto gv : context_.global_map) { exec_->global_map.insert({gv.first->name_hint, gv.second}); } - - ExternalFuncCodegen(); } Module VMCompiler::OptimizeModule(const Module& mod, const TargetsMap& targets) { @@ -973,7 +945,7 @@ void VMCompiler::PopulateGlobalMap() { } } -void VMCompiler::PrimitiveFuncCodegen() { +void VMCompiler::LibraryCodegen() { auto const &cached_funcs = context_.cached_funcs; if (cached_funcs.size() == 0) { return; @@ -1007,48 +979,6 @@ void VMCompiler::PrimitiveFuncCodegen() { } } -void VMCompiler::ExternalFuncCodegen() { - // The codegen tool/compiler to the list of function mapping. - std::unordered_map comp_module; - // The codegen tool to lib index mapping. - std::unordered_map comp_map; - // The function index to the external function and codegen tool mapping. 
- std::unordered_map > func_codgen; - for (size_t i = 0; i < context_.external_funcs.size(); i++) { - const auto& it = context_.external_funcs[i]; - auto func_name = FunctionGetAttr(it, attr::kFuncName); - CHECK(func_name.defined()) << "Cannot find func_name attribute"; - const auto* func_name_str = func_name.as(); - CHECK(func_name_str); - CHECK(it->IsExternal()); - auto comp = FunctionGetAttr(it, attr::kExternal); - const auto* comp_name = comp.as(); - CHECK(comp_name); - if (comp_module.count(comp_name->value) == 0) { - comp_module.emplace(comp_name->value, relay::ModuleNode::make({}, {})); - } - CHECK(it->checked_type_.defined()) - << "Please perform type inference on the external function first." - << "\n"; - comp_module[comp_name->value]->Add(GlobalVarNode::make(func_name_str->value), it); - func_codgen[i] = std::make_pair(func_name_str->value, comp_name->value); - } - - - for (const auto& it : comp_module) { - const auto *cg = runtime::Registry::Get("relay.ext." + it.first); - CHECK(cg) << "relay.ext." << it.first << " is not registered"; - runtime::Module mod = (*cg)(it.second); - comp_map.emplace(it.first, exec_->ext_libs.size()); - exec_->ext_libs.push_back(mod); - } - - for (size_t i = 0; i < context_.external_funcs.size(); i++) { - exec_->external_func_map.emplace(i, std::get<0>(func_codgen[i])); - exec_->external_map.emplace(i, comp_map[std::get<1>(func_codgen[i])]); - } -} - runtime::Module CreateVMCompiler() { auto exec = make_object(); return runtime::Module(exec); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index c5c022d9d2d7..8cdb12e4dafa 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -77,10 +77,9 @@ struct VMCompilerContext { std::vector cached_funcs; // The functions that have been lowered. std::unordered_map seen_funcs; - // List of external functions that are used by external libraries. - std::vector external_funcs; }; + class VMCompiler : public runtime::ModuleNode { public: virtual ~VMCompiler() {} @@ -131,11 +130,7 @@ class VMCompiler : public runtime::ModuleNode { void PopulateGlobalMap(); - /* \brief Use TVM codegen to generat code for primitive functions. */ - void PrimitiveFuncCodegen(); - - /* \brief Use TVM codegen to generat code for external functions. */ - void ExternalFuncCodegen(); + void LibraryCodegen(); protected: /*! \brief Target devices. 
*/ diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index aa0d4fc99911..6383d95f1581 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -199,7 +199,7 @@ class Partitioner : public ExprMutator { FunctionNode::make(params, input, call->args[0]->checked_type_, {}, Attrs()); Expr arg0 = call->args[0]; - std::string name = "subgraph_" + std::to_string(subgraph->id); + std::string name = subgraph_attrs->compiler + "_" + std::to_string(subgraph->id); subgraph_func = FunctionSetAttr(subgraph_func, attr::kFuncName, tvm::ir::StringImm::make(name)); subgraph_func = FunctionSetAttr(subgraph_func, attr::kPrimitive, tvm::Integer(1)); diff --git a/src/relay/pass/pass_manager.cc b/src/relay/pass/pass_manager.cc index 1c541cc730a5..fd834a679a93 100644 --- a/src/relay/pass/pass_manager.cc +++ b/src/relay/pass/pass_manager.cc @@ -331,9 +331,9 @@ Module FunctionPassNode::operator()(const Module& mod, bool FunctionPassNode::SkipFunction(const Function& func) const { NodeRef skip_opt = FunctionGetAttr(func, attr::kSkipOptimization); - NodeRef is_extern = FunctionGetAttr(func, attr::kExternal); + NodeRef ext = FunctionGetAttr(func, attr::kExternal); const ir::IntImm* pval = skip_opt.as(); - const ir::StringImm* sval = is_extern.as(); + const ir::StringImm* sval = ext.as(); return (pval && pval->value != 0) || (sval && sval->value.size() > 0); } diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc index cbeddd816417..1412a9bc0dae 100644 --- a/src/runtime/contrib/dnnl/dnnl.cc +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -17,62 +17,231 @@ * under the License. */ -#include "dnnl.h" +/*! + * \file src/runtime/contrib/dnnl/dnnl.cc + * \brief TVM compatible wrappers for dnnl kernels. 
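+ *
+ * Note: the kernels below are exposed as plain extern "C" symbols
+ * (dnnl_conv2d, dnnl_dense, dnnl_relu, dnnl_bn and dnnl_add) that take raw
+ * float pointers plus integer shape parameters, and are intended to be
+ * invoked from the C code generated for DNNL subgraphs.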
+ */ + +#include -#include -#include +#include +#include +#include + +#include +#include +#include #include +#include namespace tvm { namespace runtime { namespace contrib { -void DNNLModule::Init() { - if (!IsOpen()) { - CHECK_GT(lib_path_.size(), 0U); - Open({lib_path_}); - } +using namespace dnnl; + +typedef struct { + void** data; +} DnnlPackedArgs; + +// Read from memory, write to handle +inline void read_from_dnnl_memory(void* handle, const memory& mem) { + size_t bytes = mem.get_desc().get_size(); + + uint8_t* src = static_cast(mem.get_data_handle()); + std::copy(src, src + bytes, reinterpret_cast(handle)); } -runtime::PackedFunc DNNLModule::GetFunction( - const std::string& name, const std::shared_ptr& sptr_to_self) { - if (name == "init") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - this->Init(); - }); - } else { - std::string curr_id = GetSubgraphID(name); - - CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - - return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { - const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); - runtime::NDArray out_arg = args[args.size() - 1]; - auto out = reinterpret_cast(out_arg->data); - - // Get function from the library - std::string encoded_name = kDnnlPrefix + curr_id; - auto func_s = reinterpret_cast(GetSymbol(encoded_name)); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - DnnlPackedArgs packed_args; - packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); - for (int i = 0; i < args.size() - 1; ++i) { - runtime::NDArray arg = args[i]; - packed_args.data[i] = reinterpret_cast(arg->data); - } - (*func_s)(packed_args, out); - } else { - LOG(FATAL) << "Only support float32 type."; - } - *rv = out; - }); - } +extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, + int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, + int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, + int p_Sh_, int p_Sw_) { + using tag = memory::format_tag; + using dt = memory::data_type; + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims conv2d_src_tz = {p_N_, p_C_, p_H_, p_W_}; + memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; + if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; + memory::dims conv2d_bias_tz = {p_O_}; + memory::dims conv2d_dst_tz = {p_N_, p_O_, + (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, + (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; + memory::dims conv2d_strides = {p_Sh_, p_Sw_}; + memory::dims conv2d_padding = {p_Ph_, p_Pw_}; + + std::vector conv2d_bias(p_O_, 0); + + auto user_src_memory = + memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); + auto user_weights_memory = memory( + {{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? 
tag::goihw : tag::oihw}, eng, + weights); + auto conv2d_user_bias_memory = + memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); + + auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); + auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); + auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); + auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); + + auto conv2d_desc = convolution_forward::desc( + prop_kind::forward_inference, algorithm::convolution_direct, + conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md, + conv2d_strides, conv2d_padding, conv2d_padding); + auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); + + auto conv2d_src_memory = user_src_memory; + auto conv2d_weights_memory = user_weights_memory; + auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); + + auto conv = convolution_forward(conv2d_prim_desc); + conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, + {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, + {DNNL_ARG_BIAS, conv2d_user_bias_memory}, + {DNNL_ARG_DST, conv2d_dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, conv2d_dst_memory); +} + +extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, + int p_I_, int p_O_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_B_, p_I_}; + memory::dims weight_tz = {p_O_, p_I_}; + memory::dims bias_tz = {p_O_}; + memory::dims dst_tz = {p_B_, p_O_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nc}; + auto weight_md = memory::desc({{weight_tz}, dt::f32, tag::nc}); + auto bias_md = memory::desc({{bias_tz}, dt::f32, tag::x}); + auto dst_md = memory::desc({{dst_tz}, dt::f32, tag::nc}); + + std::vector bias(p_O_, 0); + auto data_memory = memory(data_md, eng, data); + auto weight_memory = memory(weight_md, eng, weight); + auto bias_memory = memory(bias_md, eng, bias.data()); + auto dst_memory = memory(dst_md, eng); + + auto dense_desc = inner_product_forward::desc( + prop_kind::forward_inference, data_md, weight_md, bias_md, dst_md); + auto dense_prim_desc = inner_product_forward::primitive_desc(dense_desc, eng); + assert(dst_md == dense_prim_desc.dst_desc()); + + auto dense = inner_product_forward(dense_prim_desc); + dense.execute(s, {{DNNL_ARG_SRC, data_memory}, + {DNNL_ARG_WEIGHTS, weight_memory}, + {DNNL_ARG_BIAS, bias_memory}, + {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); +} + +extern "C" void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, + int p_W_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + + auto data_memory = memory(data_md, eng, data); + auto dst_memory = memory(data_md, eng); + + auto relu_desc = eltwise_forward::desc(prop_kind::forward_inference, + algorithm::eltwise_relu, data_md, 0); + auto relu_prim_desc = eltwise_forward::primitive_desc(relu_desc, eng); + assert(data_md == relu_prim_desc.dst_desc()); + + auto relu = eltwise_forward(relu_prim_desc); + relu.execute(s, {{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); +} + +extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, + float* variance, 
float* out, int p_N_, int p_C_, + int p_H_, int p_W_, int p_E_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + + auto data_memory = memory(data_md, eng, data); + auto dst_memory = memory(data_md, eng); + + auto bn_desc = batch_normalization_forward::desc( + prop_kind::forward_inference, data_md, p_E_, + normalization_flags::use_global_stats | + normalization_flags::use_scale_shift); + auto bn_prim_desc = batch_normalization_forward::primitive_desc(bn_desc, eng); + assert(data_md == bn_prim_desc.dst_desc()); + + float* weight = reinterpret_cast(malloc(sizeof(float) * 2 * p_C_)); + memcpy(weight, gamma, sizeof(float) * p_C_); + memcpy(weight + p_C_, beta, sizeof(float) * p_C_); + + auto weight_memory = memory(bn_prim_desc.weights_desc(), eng, weight); + auto mean_memory = memory(bn_prim_desc.mean_desc(), eng, mean); + auto variance_memory = memory(bn_prim_desc.variance_desc(), eng, variance); + + auto bn = batch_normalization_forward(bn_prim_desc); + bn.execute(s, {{DNNL_ARG_SRC, data_memory}, + {DNNL_ARG_DST, dst_memory}, + {DNNL_ARG_SCALE_SHIFT, weight_memory}, + {DNNL_ARG_MEAN, mean_memory}, + {DNNL_ARG_VARIANCE, variance_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); + free(weight); +} + +extern "C" void dnnl_add(float* data, float* weight, float* out, int p_N_, + int p_C_, int p_H_, int p_W_) { + using tag = memory::format_tag; + using dt = memory::data_type; + + engine eng(engine::kind::cpu, 0); + stream s(eng); + + memory::dims data_tz = {p_N_, p_C_, p_H_, p_W_}; + + auto data_md = memory::desc{{data_tz}, dt::f32, tag::nchw}; + auto weight_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); + auto dst_md = memory::desc({{data_tz}, dt::f32, tag::nchw}); + + auto data_memory = memory(data_md, eng, data); + auto weight_memory = memory(weight_md, eng, weight); + auto dst_memory = memory(dst_md, eng); + + auto add_desc = + binary::desc(algorithm::binary_add, data_md, weight_md, dst_md); + auto add_prim_desc = binary::primitive_desc(add_desc, eng); + assert(dst_md == add_prim_desc.dst_desc()); + + auto add = binary(add_prim_desc); + add.execute(s, {{DNNL_ARG_SRC_0, data_memory}, + {DNNL_ARG_SRC_1, weight_memory}, + {DNNL_ARG_DST, dst_memory}}); + s.wait(); + read_from_dnnl_memory(out, dst_memory); } } // namespace contrib } // namespace runtime } // namespace tvm - - diff --git a/src/runtime/contrib/dnnl/dnnl.h b/src/runtime/contrib/dnnl/dnnl.h deleted file mode 100644 index bde9ae068fa8..000000000000 --- a/src/runtime/contrib/dnnl/dnnl.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#ifndef TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ -#define TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ - -#include -#include -#include -#include "../extern_common.h" - -namespace tvm { -namespace runtime { -namespace contrib { - -/*! - * \brief Defined a data structure to save dnnl subgraph args. - */ -typedef struct { - void** data; -} DnnlPackedArgs; - -constexpr const char* kDnnlPrefix = "dnnl_"; - -typedef void (*DnnlSubgraphFunc)(DnnlPackedArgs in, float* out); - -class DNNLModule : public ExternModuleBase { - public: - explicit DNNLModule(const std::string& lib_path) : lib_path_(lib_path) {} - - const char* type_key() const final { - return "DNNLModule"; - } - - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) final; - - void Init() final; - - private: - /*! \brief The path to the compiled dnnl library.*/ - std::string lib_path_; -}; - -} // namespace contrib -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CONTRIB_DNNL_DNNL_H_ diff --git a/src/runtime/contrib/extern_common.h b/src/runtime/contrib/extern_common.h deleted file mode 100644 index 0a400c11649d..000000000000 --- a/src/runtime/contrib/extern_common.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file src/runtime/contrib/extern_common.h - * \brief The definition of the base class for the external runtime. - */ - -#ifndef TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ -#define TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ - -#include -#include -#include -#include -#include - -#include -#include - -#if defined(_WIN32) -#include -#else -#include -#endif - -namespace tvm { -namespace runtime { -namespace contrib { - -/*! - * \brief Split the encoded function name to tokens. - * - * \param the function name string. - * - * \return a vector of tokenized function name splitted by "_". - */ -static inline std::string GetSubgraphID(const std::string& name) { - std::string temp = name; - std::vector tokens; - std::string delimiter = "_"; - size_t pos = 0; - std::string token; - while ((pos = temp.find(delimiter)) != std::string::npos) { - token = temp.substr(0, pos); - tokens.push_back(token); - temp.erase(0, pos + delimiter.length()); - } - tokens.push_back(temp); - - CHECK(tokens.size() >= 2) << "Invalid subgraph name: " << name; - CHECK(tokens[0] == "subgraph") - << "Function name does not start with \"subgraph\": " << name; - return tokens[1]; -} - -class ExternModuleBase : public runtime:: ModuleNode { - public: - ExternModuleBase() = default; - - ~ExternModuleBase() { - Close(); - } - - /*! - * \brief Get a PackedFunc from module, which is a function ptr can be invoked - * for execution given some parameters. - * - * \param name the name of the external function. 
- * \param sptr_to_self The shared_ptr that points to this module node. - * - * \return PackedFunc(nullptr) when it is not available. - */ - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) override = 0; - - const char* type_key() const override { - return "ExternModuleBase"; - } - - protected: - // Platform dependent handlers for opening system lib. -#if defined(_WIN32) - // The handle. - HMODULE handle_{nullptr}; - - // Check if the handle_ is open. - bool IsOpen() const { - return handle_ != nullptr; - } - - // Open the library. - virtual void Open(const std::string& name) { - std::wstring wname(name.begin(), name.end()); - handle_ = LoadLibraryW(wname.c_str()); - CHECK(handle_ != nullptr) - << "Failed to open the dynamic shared library " << name; - } - - // Retrieve a symbol. - virtual void* GetSymbol(const std::string& name) { - return reinterpret_cast( - GetProcAddress(handle_, (LPCSTR)name.c_str())); // NOLINT(*) - } - - // Close the handle. - virtual void Close() { - if (handle_) { - FreeLibrary(handle_); - } - } -#else - // The handle. - void* handle_{nullptr}; - - // Check if the handle_ is open. - bool IsOpen() const { - return handle_ != nullptr; - } - - // load the library. - virtual void Open(const std::vector lib_names) { - CHECK(lib_names.size() == 1) - << "Default library loader only loads one library. " - << "Please override the loader if multiple libraries are used"; - handle_ = dlopen(lib_names[0].c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(handle_ != nullptr) << "Failed to open the dynamic shared library " - << lib_names[0] << " " << dlerror(); - } - - /*! - * \brief Retrieve the pre-compiled function symbol from the opened library. - * - * \param name the name of the external function. - * - * \return The pointer to the external function. - * \note Exceptions when loading the symbol can be retrieved by dlerror(). - */ - virtual void* GetSymbol(const std::string& name) { - auto sym = dlsym(handle_, name.c_str()); - char* error = dlerror(); - if (error) { - CHECK(0) << "Fail to get symbol " << name << ": " << error; - } - return sym; - } - - virtual void Close() { - if (handle_) { - dlclose(handle_); - } - } -#endif - - // Initialize an external runtime module. - virtual void Init() = 0; -}; - -} // namespace contrib -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CONTRIB_EXTERN_COMMON_H_ diff --git a/src/runtime/contrib/gcc/gcc.cc b/src/runtime/contrib/gcc/gcc.cc deleted file mode 100644 index 183fd85d65fb..000000000000 --- a/src/runtime/contrib/gcc/gcc.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include "gcc.h" - -#include -#include -#include -#include -#include - -namespace tvm { -namespace runtime { -namespace contrib { - -void GccModule::Init() { - if (!IsOpen()) { - CHECK_GT(lib_path_.size(), 0U); - Open({lib_path_}); - } -} - -runtime::PackedFunc GccModule::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - if (name == "init") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - this->Init(); - }); - } else { - std::string curr_id = GetSubgraphID(name); - - CHECK(IsOpen()) << "The external module has not been built or failed to open.\n"; - // Generate an external packed function - return PackedFunc([sptr_to_self, curr_id, this](TVMArgs args, TVMRetValue* rv) { - const DLTensor* dptr = ((runtime::NDArray)args[0]).operator->(); - runtime::NDArray out_arg = args[args.size() - 1]; - auto out = reinterpret_cast(out_arg->data); - - // Get function from the library - std::string encoded_name = "gcc_" + curr_id; - auto func_s = reinterpret_cast(GetSymbol(encoded_name)); - - // Reinterpret data and function to the right type and invoke - if (runtime::TypeMatch(dptr->dtype, kDLFloat, 32)) { - GccPackedArgs packed_args; - packed_args.data = reinterpret_cast(malloc(sizeof(float*) * args.size())); - for (int i = 0; i < args.size() - 1; ++i) { - runtime::NDArray arg = args[i]; - packed_args.data[i] = reinterpret_cast(arg->data); - } - (*func_s)(packed_args, out); - } else { - LOG(FATAL) << "Only support float32 type."; - } - *rv = out; - }); - } -} - -} // namespace contrib -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/contrib/gcc/gcc.h b/src/runtime/contrib/gcc/gcc.h deleted file mode 100644 index 4c23c218ea8f..000000000000 --- a/src/runtime/contrib/gcc/gcc.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_RUNTIME_CONTRIB_GCC_GCC_H_ -#define TVM_RUNTIME_CONTRIB_GCC_GCC_H_ - -#include -#include -#include -#include "../extern_common.h" - -namespace tvm { -namespace runtime { -namespace contrib { - -constexpr const char* kGccPrefix = "gcc_"; - -/*! - * \brief Defined a data structure to save subgraph args. 
- */ -typedef struct { - float** data; -} GccPackedArgs; - -typedef void (*GccSubgraphFunc)(GccPackedArgs in, float* out); - -class GccModule : public ExternModuleBase { - public: - explicit GccModule(const std::string& lib_path) : lib_path_(lib_path) {} - - const char* type_key() const final { - return "GccModule"; - } - - runtime::PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) final; - - void Init() final; - - private: - std::string lib_path_; -}; - -} // namespace contrib -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CONTRIB_GCC_GCC_H_ diff --git a/src/runtime/module_util.cc b/src/runtime/module_util.cc index 445bfd343653..c20e52414cc3 100644 --- a/src/runtime/module_util.cc +++ b/src/runtime/module_util.cc @@ -49,6 +49,7 @@ void ImportModuleBlob(const char* mblob, std::vector* mlist) { for (uint64_t i = 0; i < size; ++i) { std::string tkey; CHECK(stream->Read(&tkey)); + if (tkey == "c") continue; std::string fkey = "module.loadbinary_" + tkey; const PackedFunc* f = Registry::Get(fkey); CHECK(f != nullptr) diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 81d3d11e0ecd..f02fadb53ed9 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -308,17 +308,10 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { // Number of fields = 0 break; } - case Opcode::InvokeExternal: { + case Opcode::InvokePacked: { // Number of fields = 3 + instr.arity // Note that arity includes both input arguments and outputs. We will // put all the `arity` number of fields in the end for serialization. - fields.assign({instr.ext_index, instr.ext_arity, instr.ext_output_size}); - // Save the args. - fields.insert(fields.end(), instr.ext_args, instr.ext_args + instr.ext_arity); - break; - } - case Opcode::InvokePacked: { - // Number of fields = 3 + instr.arity fields.assign({instr.packed_index, instr.arity, instr.output_size}); // Save the args. 
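      // For reference, the serialized field layout for InvokePacked is therefore
      // [packed_index, arity, output_size, arg_0, ..., arg_{arity-1}];
      // DeserializeInstruction below recovers the argument registers from
      // fields[3 .. 3 + arity).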
fields.insert(fields.end(), instr.packed_args, instr.packed_args + instr.arity); @@ -564,17 +557,6 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { DCHECK(instr.fields.empty()); return Instruction::Fatal(); } - case Opcode::InvokeExternal: { - // Number of fields = 3 + instr.arity - DCHECK_GE(instr.fields.size(), 3U); - DCHECK_EQ(instr.fields.size(), 3U + static_cast(instr.fields[1])); - - Index ext_index = instr.fields[0]; - Index arity = instr.fields[1]; - Index output_size = instr.fields[2]; - std::vector args = ExtractFields(instr.fields, 3, arity); - return Instruction::InvokePacked(ext_index, arity, output_size, args); - } case Opcode::InvokePacked: { // Number of fields = 3 + instr.arity DCHECK_GE(instr.fields.size(), 3U); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 86b3d3dc2dfa..333dd1e44506 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -105,12 +105,6 @@ Instruction::Instruction(const Instruction& instr) { this->output_size = instr.output_size; this->packed_args = Duplicate(instr.packed_args, instr.arity); return; - case Opcode::InvokeExternal: - this->ext_index = instr.ext_index; - this->ext_arity = instr.ext_arity; - this->ext_output_size = instr.ext_output_size; - this->ext_args = Duplicate(instr.ext_args, instr.ext_arity); - return; case Opcode::InvokeClosure: this->closure = instr.closure; this->num_closure_args = instr.num_closure_args; @@ -204,13 +198,6 @@ Instruction& Instruction::operator=(const Instruction& instr) { FreeIf(this->packed_args); this->packed_args = Duplicate(instr.packed_args, instr.arity); return *this; - case Opcode::InvokeExternal: - this->ext_index = instr.ext_index; - this->ext_arity = instr.ext_arity; - this->ext_output_size = instr.ext_output_size; - FreeIf(this->ext_args); - this->ext_args = Duplicate(instr.ext_args, instr.ext_arity); - return *this; case Opcode::InvokeClosure: this->closure = instr.closure; this->num_closure_args = instr.num_closure_args; @@ -275,9 +262,6 @@ Instruction::~Instruction() { case Opcode::InvokePacked: delete this->packed_args; return; - case Opcode::InvokeExternal: - delete this->ext_args; - return; case Opcode::InvokeClosure: delete this->closure_args; return; @@ -319,22 +303,6 @@ Instruction Instruction::InvokePacked(Index packed_index, return instr; } -Instruction Instruction::InvokeExternal(Index ext_index, - Index ext_arity, - Index ext_output_size, - const std::vector& args) { - Instruction instr; - instr.op = Opcode::InvokeExternal; - instr.ext_index = ext_index; - instr.ext_arity = ext_arity; - instr.ext_output_size = ext_output_size; - instr.ext_args = new RegName[ext_arity]; - for (Index i = 0; i < ext_arity; ++i) { - instr.ext_args[i] = args[i]; - } - return instr; -} - Instruction Instruction::AllocTensor( RegName storage, const std::vector& shape, @@ -548,16 +516,6 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { << ")"; break; } - case Opcode::InvokeExternal: { - os << "invoke_external Function[" << instr.ext_index << "] (in: $" - << StrJoin(instr.ext_args, 0, - instr.ext_arity - instr.ext_output_size, ", $") - << ", out: $" - << StrJoin(instr.ext_args, instr.ext_arity - instr.ext_output_size, - instr.ext_output_size, ", $") - << ")"; - break; - } case Opcode::AllocTensor: { os << "alloc_tensor $" << instr.dst << " $" << instr.alloc_tensor.storage << " [" @@ -842,22 +800,9 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } packed_funcs_[packed_index] = lib.GetFunction(packed_name); } - - for (const auto& 
it : this->exec->external_map) { - Index subgraph_id = it.first; - Index ext_lib_idx = it.second; - if (external_funcs.size() <= static_cast(subgraph_id)) { - external_funcs.resize(subgraph_id + 1); - } - CHECK_GT(this->exec->external_func_map.count(subgraph_id), 0U); - const std::string& symb = exec->external_func_map.at(subgraph_id); - auto ext_mod = exec->ext_libs.at(ext_lib_idx); - CHECK(ext_mod.operator->()) << "external module is not defined." << "\n"; - ext_mod.GetFunction("init")(); - external_funcs[subgraph_id] = ext_mod.GetFunction(symb); - } } + void VirtualMachine::Init(const std::vector& ctxs) { ctxs_ = ctxs; } @@ -962,20 +907,6 @@ void VirtualMachine::RunLoop() { pc_++; goto main_loop; } - case Opcode::InvokeExternal: { - const auto& func = external_funcs[instr.ext_index]; - const auto& arity = instr.ext_arity; - std::vector args; - for (Index i = 0; i < arity; ++i) { - DLOG(INFO) << - "arg" << i << " $" << instr.ext_args[i]; - auto arg = ReadRegister(instr.ext_args[i]); - args.push_back(arg); - } - InvokePacked(instr.ext_index, func, arity, instr.ext_output_size, args); - pc++; - goto main_loop; - } case Opcode::InvokeClosure: { auto object = ReadRegister(instr.closure); const auto* closure = object.as(); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index b8bad9419ad2..a397f5d0ed73 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -24,6 +24,8 @@ import tvm.relay.transform from tvm.relay.expr_functor import ExprMutator from tvm.relay.annotation import subgraph_begin, subgraph_end +from tvm.contrib import util + class GCCAnnotator(ExprMutator): """ @@ -40,12 +42,13 @@ class GCCAnnotator(ExprMutator): -- end -- | """ + def __init__(self): super(GCCAnnotator, self).__init__() self.in_subgraph = 0 def visit_call(self, call): - if call.op.name == "add": # Annotate begin at args + if call.op.name == "add": # Annotate begin at args if self.in_subgraph == 1: lhs = subgraph_begin(super().visit(call.args[0]), "gcc") rhs = subgraph_begin(super().visit(call.args[1]), "gcc") @@ -61,7 +64,7 @@ def visit_call(self, call): if isinstance(rhs, relay.expr.Var): rhs = subgraph_begin(rhs, "gcc") return relay.subtract(lhs, rhs) - elif call.op.name == "multiply": # Annotate end at output + elif call.op.name == "multiply": # Annotate end at output self.in_subgraph = 1 lhs = super().visit(call.args[0]) rhs = super().visit(call.args[1]) @@ -135,14 +138,14 @@ def visit_call(self, call): def test_multi_node_subgraph(): x = relay.var('x', shape=(10, 10)) - w0 = relay.var('w', shape=(10, 10)) - w1 = relay.var('w', shape=(10, 10)) - w2 = relay.var('w', shape=(10, 10)) - w3 = relay.var('w', shape=(10, 10)) - w4 = relay.var('w', shape=(10, 10)) - w5 = relay.var('w', shape=(10, 10)) - w6 = relay.var('w', shape=(10, 10)) - w7 = relay.var('w', shape=(10, 10)) + w0 = relay.var('w0', shape=(10, 10)) + w1 = relay.var('w1', shape=(10, 10)) + w2 = relay.var('w2', shape=(10, 10)) + w3 = relay.var('w3', shape=(10, 10)) + w4 = relay.var('w4', shape=(10, 10)) + w5 = relay.var('w5', shape=(10, 10)) + w6 = relay.var('w6', shape=(10, 10)) + w7 = relay.var('w7', shape=(10, 10)) # Subgraph on GCC # FIXME: We generate two subgraphs for this case but they should be merged to one @@ -172,16 +175,34 @@ def test_multi_node_subgraph(): for _ in range(8): w_data.append(np.random.rand(10, 10).astype('float32')) - for kind in ["debug", "vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(0)) 
- res = ex.evaluate()(x_data, *w_data) - tvm.testing.assert_allclose( - res.asnumpy(), - np.concatenate( - (((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, _ = relay.build(mod, "llvm") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11"] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.module.load(lib_path) + + ctx = tvm.cpu() + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + for i in range(8): + data = np.random.rand(10, 10).astype('float32') + w_data.append(data) + var = "w" + str(i) + rt_mod.set_input(var, data) + rt_mod.run() + out = tvm.nd.empty((30, 10), ctx=ctx) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose( + out.asnumpy(), + np.concatenate( + (((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) def test_extern_gcc_single_op(): @@ -195,10 +216,25 @@ def test_extern_gcc_single_op(): mod["main"] = f mod = relay.build_extern(mod, "gcc") - for kind in ["debug", "vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), (x_data + y_data)) + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, _ = relay.build(mod, "llvm") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11"] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.module.load(lib_path) + + ctx = tvm.cpu() + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod.set_input("x", x_data) + rt_mod.set_input("y", y_data) + rt_mod.run() + out = tvm.nd.empty((8, 8), ctx=ctx) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose(out.asnumpy(), (x_data + y_data)) def test_extern_gcc(): @@ -213,11 +249,27 @@ def test_extern_gcc(): mod["main"] = f mod = relay.build_extern(mod, "gcc") - for kind in ["debug", "vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") - res = ex.evaluate()(x_data, y_data) - tvm.testing.assert_allclose(res.asnumpy(), - (y_data * y_data) - (x_data + x_data)) + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, _ = relay.build(mod, "llvm") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11"] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.module.load(lib_path) + + ctx = tvm.cpu() + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod.set_input("x", x_data) + rt_mod.set_input("y", y_data) + rt_mod.run() + out = tvm.nd.empty((2, 2), ctx=ctx) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose(out.asnumpy(), + (y_data * y_data) - (x_data + x_data)) + def test_extern_dnnl(): dtype = 'float32' @@ -249,21 +301,37 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - for kind in ["debug", "vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu()) - res = ex.evaluate()(i_data, w1_data) + with relay.build_config(opt_level=3, 
disabled_pass=["AlterOpLayout"]): + json, lib, _ = relay.build(mod, "llvm") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11"] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.module.load(lib_path) + + ctx = tvm.cpu() + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod.set_input("data", i_data) + rt_mod.set_input("weight1", w1_data) + rt_mod.run() + out = tvm.nd.empty((1, 32, 14, 14), ctx=ctx) + out = rt_mod.get_output(0, out) + + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=ctx) + ref_res = ref_ex.evaluate()(i_data, w1_data) - ref_ex = relay.create_executor(kind, mod=ref_mod, ctx=tvm.cpu(0)) - ref_res = ref_ex.evaluate()(i_data, w1_data) + tvm.testing.assert_allclose(out.asnumpy(), ref_res.asnumpy(), rtol=1e-5) - tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @nottest def test_extern_dnnl_mobilenet(): # FIXME: This test is only for demo purpose and supposed to be removed. dtype = 'float32' ishape = (1, 3, 224, 224) - mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype='float32') + mod, params = relay.testing.mobilenet.get_workload( + batch_size=1, dtype='float32') mod = relay.build_extern(mod, "dnnl") @@ -283,8 +351,8 @@ def test_extern_dnnl_mobilenet(): if __name__ == "__main__": - test_multi_node_subgraph() - test_extern_gcc_single_op() - test_extern_gcc() - #test_extern_dnnl() - #test_extern_dnnl_mobilenet() + # test_multi_node_subgraph() + # test_extern_gcc_single_op() + # test_extern_gcc() + test_extern_dnnl() + # test_extern_dnnl_mobilenet() diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index 1933cd94b860..0e3d8e93fad0 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -16,7 +16,7 @@ # under the License. """ -.. _tutorial-custom-relay-backend +.. _tutorial-custom-relay-backend: Bring Your Own Codegen To TVM ============================================ @@ -28,7 +28,7 @@ provide libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, or provide frameworks such as TensorRT to let users describle their models in a certain way to achieve high performance. However, users have to learn a new programming interface when they -attempt to work on a new libaray or device. As a result, the demand of a unified programming +attempt to work on a new library or device. As a result, the demand of a unified programming interface becomes more and more important to 1) let all users and hardware vendors stand on the same page, and 2) provide a feasible solution to allow a specialized hardware or library to only support widely used operators with extremely high perofrmance, but fallback unsupported operators @@ -51,7 +51,7 @@ # Define The Supported Operators # ------------------------------ # The first step is to define which operators are supported by your backend. -# A templated is provided to ease vendor's effort to add the supported +# A template is provided to ease vendor's effort to add the supported # operators. # # For example, We create a new Python file at python/relay/backend/op/contrib/gcc/extern_op.py, @@ -105,7 +105,7 @@ def multiply(attrs, args): # a Relay IR mutator to find the supported subgraphs, which may include multiple operators, # for the target backend. Here we implement an annotator that includes an entire Relay graph # to be offloaded. 
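# (In this tutorial the whole graph is offloaded for simplicity; a real
# backend would typically only annotate the calls it registered as supported
# in the previous section.)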
Specifically, we are going to do two tasks: -# - insert `aubgraph_begin` after all input variables +# - insert `subgraph_begin` after all input variables # - insert `subgraph_end` before the primary output. For example, given a Relay graph as follows: # input_a # | @@ -265,7 +265,7 @@ def visit_call(self, call): ###################################################################### -# We can now build TVM with the external GCC backedn and test the correctness: +# We can now build TVM with the external GCC backend and test the correctness: # 1. cd build # 2. set(USE_EXTERN gcc) in config.cmake # 3. cmake ..; make -j From 6fb0605b311a207a8724053d2d4f5ac9e2dc1cfa Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 27 Nov 2019 13:12:47 -0800 Subject: [PATCH 31/34] fix test and clean code --- python/tvm/module.py | 1 - src/relay/backend/contrib/contrib_codegen.h | 47 +++++++ src/relay/backend/contrib/dnnl/codegen.cc | 45 +----- src/relay/backend/contrib/gcc/codegen.cc | 48 +------ src/relay/pass/extern_op.cc | 1 - src/relay/pass/partition_graph.cc | 2 +- .../python/relay/test_pass_partition_graph.py | 129 ++++++------------ 7 files changed, 93 insertions(+), 180 deletions(-) diff --git a/python/tvm/module.py b/python/tvm/module.py index fcf74828f003..d9676169cc5a 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -142,7 +142,6 @@ def export_library(self, path_cc = temp.relpath(c_file_name) with open(path_cc, "w") as f: f.write(m.get_source()) - print(m.get_source()) files.append(path_cc) path_cc = temp.relpath("devc.cc") with open(path_cc, "w") as f: diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h index b38c3dfe0e02..7f5b6724c8d6 100644 --- a/src/relay/backend/contrib/contrib_codegen.h +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -184,6 +184,53 @@ class ExternSourcePrinter { code_stream_ << "}"; } + virtual std::string JIT(void) = 0; + + std::string JitImpl(std::string subgraph_id, + std::vector args, + std::vector buf_decl, + std::vector body, + std::vector> out) { + // Create the signature. For example, it could be: + // extern "C" void dnnl_0_(float* input0, float* input1, float* out, int M, int N) {} + code_stream_ << "extern \"C\" void " << subgraph_id << "_("; + + for (const auto& arg : args) { + code_stream_ << "float* " << arg << ", "; + } + code_stream_ << "float* out) {\n"; + this->EnterScope(); + + // Function body + for (auto decl : buf_decl) { + this->PrintIndents(); + code_stream_ << decl << "\n"; + } + code_stream_ << "\n"; + for (auto stmt : body) { + this->PrintIndents(); + code_stream_ << stmt << "\n"; + } + + // Copy output + CHECK_EQ(out.size(), 1U) << "Internal error: only single output is support."; + this->PrintIndents(); + code_stream_ << "std::memcpy(out, " << out[0].first << ", 4 * " << out[0].second << ");\n"; + + // Free buffers + for (size_t i = 0; i < buf_decl.size(); i++) { + this->PrintIndents(); + code_stream_ << "std::free(buf_" << i << ");\n"; + } + + this->ExitScope(); + code_stream_ << "}\n"; + + // Create the wrapper to call the subgraph + this->GenerateSubgraphWrapper(subgraph_id, args.size() + 1 /* output */); + return code_stream_.str(); + } + /*! \brief The external function source code stream. 
*/ std::ostringstream code_stream_; diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index bae44cb201e4..2f66a446194f 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -164,47 +164,8 @@ class DnnlBuilder : public ExprVisitor, public ExternSourcePrinter { out_.push_back({out, out_size}); } - std::string jit_dnnl() { - // Create the signature. For example, it could be: - // extern "C" void dnnl_0_(float* input0, float* input1, float* out, int M, int N) {} - code_stream_ << "extern \"C\" void " << subgraph_id_ << "_("; - - for (const auto& arg : subgraph_args_) { - code_stream_ << "float* " << arg << ", "; - } - code_stream_ << "float* out) {\n"; - this->EnterScope(); - - // Function body - for (auto decl : buf_decl_) { - this->PrintIndents(); - code_stream_ << decl << "\n"; - } - code_stream_ << "\n"; - for (auto stmt : subgraph_body) { - this->PrintIndents(); - code_stream_ << stmt << "\n"; - } - - // Copy output - CHECK_EQ(out_.size(), 1U) << "Internal error: only single output is support yet."; - this->PrintIndents(); - code_stream_ << "std::memcpy(out, " << out_[0].first << ", 4 * " - << out_[0].second << ");\n"; - - // Free buffers - for (size_t i = 0; i < buf_decl_.size(); i++) { - this->PrintIndents(); - code_stream_ << "std::free(buf_" << i << ");\n"; - } - - this->ExitScope(); - code_stream_ << "}\n"; - - // Create the wrapper to call the subgraph - this->GenerateSubgraphWrapper(subgraph_id_, - subgraph_args_.size() + 1 /* output */); - return code_stream_.str(); + std::string JIT(void) { + return JitImpl(subgraph_id_, subgraph_args_, buf_decl_, subgraph_body, out_); } private: @@ -279,7 +240,7 @@ class DNNLCodegen : public ExternCodegenBase { auto builder = DnnlBuilder("dnnl_" + sid); builder.VisitExpr(func->body); - code_stream_ << builder.jit_dnnl(); + code_stream_ << builder.JIT(); } /*! 
diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 36005ee540ad..9806f1942c88 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -108,51 +108,12 @@ class GccBuilder : public ExprVisitor, public ExternSourcePrinter { out_.push_back({out, out_size}); } - std::string jit_csource() { + std::string JIT(void) { // Write function macros for (auto decl : func_decl_) { code_stream_ << decl << "\n"; } - - // Write subgraph function declaration - code_stream_ << "extern \"C\" void " << subgraph_id_ << "_("; - - for (const auto& arg : subgraph_args_) { - code_stream_ << "float* " << arg << ", "; - } - - code_stream_ << "float* out) {\n"; - this->EnterScope(); - - // Function body - for (auto decl : buf_decl_) { - this->PrintIndents(); - code_stream_ << decl << "\n"; - } - code_stream_ << "\n"; - for (auto stmt : subgraph_body) { - this->PrintIndents(); - code_stream_ << stmt << "\n"; - } - - // Copy output - CHECK(out_.size() == 1) << "Internal error"; - this->PrintIndents(); - code_stream_ << "std::memcpy(out, " << out_[0].first << ", 4 * " << out_[0].second << ");\n"; - - // Free buffers - for (size_t i = 0; i < buf_decl_.size(); i++) { - this->PrintIndents(); - code_stream_ << "std::free(buf_" << i << ");\n"; - } - - this->ExitScope(); - code_stream_ << "}\n"; - - // Create the wrapper to call the subgraph - this->GenerateSubgraphWrapper(subgraph_id_, - subgraph_args_.size() + 1 /* output */); - return code_stream_.str(); + return JitImpl(subgraph_id_, subgraph_args_, buf_decl_, subgraph_body, out_); } private: @@ -189,7 +150,7 @@ class GccCodegen : public ExternCodegenBase { auto builder = GccBuilder("gcc_" + sid); builder.VisitExpr(func->body); - code_stream_ << builder.jit_csource(); + code_stream_ << builder.JIT(); } runtime::Module CreateExternModule(const NodeRef& ref) { @@ -217,7 +178,6 @@ class GccCodegen : public ExternCodegenBase { for (int64_t j = 0; j < p_DIM2_; ++j) { \ int64_t k = i * p_DIM2_ + j; \ out[k] = a[k] p_OP_ b[k]; \ - std::cout << a[k] << " " << b[k] << out[k] << std::endl; \ } \ } \ } @@ -236,7 +196,7 @@ class GccCodegen : public ExternCodegenBase { LOG(FATAL) << "The input ref is expected to be a Relay function or module" << "\n"; } - LOG(INFO) << code_stream_.str(); + // Create a CSourceModule const auto* pf = runtime::Registry::Get("module.csource_module_create"); CHECK(pf != nullptr) << "Cannot find csource module to create the external function"; diff --git a/src/relay/pass/extern_op.cc b/src/relay/pass/extern_op.cc index f22d8a762345..e63b506f41be 100644 --- a/src/relay/pass/extern_op.cc +++ b/src/relay/pass/extern_op.cc @@ -18,7 +18,6 @@ */ /*! - * Copyright (c) 2019 by Contributors * \file src/relay/pass/extern_op.cc * \brief Wraps a call with subgraph_begin and subgraph_end to indicate that * the op of this call node will use external compiler. diff --git a/src/relay/pass/partition_graph.cc b/src/relay/pass/partition_graph.cc index 6383d95f1581..f873c322c288 100644 --- a/src/relay/pass/partition_graph.cc +++ b/src/relay/pass/partition_graph.cc @@ -17,7 +17,7 @@ * under the License. */ -/*! 
Copyright (c) 2019 by Contributorsr +/* * \file src/relay/pass/partition_graph.cc * * \brief Partition an input function into multiple Functions according based diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index a397f5d0ed73..d5c75b2a935a 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -135,6 +135,28 @@ def visit_call(self, call): new_call = relay.Call(call.op, params, call.attrs) return new_call +def check_result(mod, map_inputs, out_shape, result, tol=1e-7): + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, _ = relay.build(mod, "llvm") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11"] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.module.load(lib_path) + + ctx = tvm.cpu() + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + + for name, data in map_inputs.items(): + rt_mod.set_input(name, data) + rt_mod.run() + out = tvm.nd.empty(out_shape, ctx=ctx) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) + def test_multi_node_subgraph(): x = relay.var('x', shape=(10, 10)) @@ -175,34 +197,14 @@ def test_multi_node_subgraph(): for _ in range(8): w_data.append(np.random.rand(10, 10).astype('float32')) - with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): - json, lib, _ = relay.build(mod, "llvm") - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++11"] - tmp_path = util.tempdir() - lib_name = 'lib.so' - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.module.load(lib_path) - - ctx = tvm.cpu() - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) - for i in range(8): - data = np.random.rand(10, 10).astype('float32') - w_data.append(data) - var = "w" + str(i) - rt_mod.set_input(var, data) - rt_mod.run() - out = tvm.nd.empty((30, 10), ctx=ctx) - out = rt_mod.get_output(0, out) - - tvm.testing.assert_allclose( - out.asnumpy(), - np.concatenate( - (((x_data + w_data[0]) - w_data[1]) * w_data[2], - ((x_data + w_data[3]) - w_data[4]) * w_data[5], - x_data + w_data[6] - w_data[7]), - axis=0)) + map_inputs = {"w{}".format(i): w_data[i] for i in range(8)} + map_inputs["x"] = x_data + check_result( + mod, map_inputs, (30, 10), + np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7]), + axis=0)) def test_extern_gcc_single_op(): @@ -216,25 +218,7 @@ def test_extern_gcc_single_op(): mod["main"] = f mod = relay.build_extern(mod, "gcc") - with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): - json, lib, _ = relay.build(mod, "llvm") - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++11"] - tmp_path = util.tempdir() - lib_name = 'lib.so' - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.module.load(lib_path) - - ctx = tvm.cpu() - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) - rt_mod.set_input("x", x_data) - rt_mod.set_input("y", y_data) - rt_mod.run() - out = tvm.nd.empty((8, 8), ctx=ctx) - out = rt_mod.get_output(0, out) - - tvm.testing.assert_allclose(out.asnumpy(), (x_data + y_data)) + check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data) def test_extern_gcc(): @@ -249,26 +233,7 
@@ def test_extern_gcc(): mod["main"] = f mod = relay.build_extern(mod, "gcc") - with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): - json, lib, _ = relay.build(mod, "llvm") - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++11"] - tmp_path = util.tempdir() - lib_name = 'lib.so' - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.module.load(lib_path) - - ctx = tvm.cpu() - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) - rt_mod.set_input("x", x_data) - rt_mod.set_input("y", y_data) - rt_mod.run() - out = tvm.nd.empty((2, 2), ctx=ctx) - out = rt_mod.get_output(0, out) - - tvm.testing.assert_allclose(out.asnumpy(), - (y_data * y_data) - (x_data + x_data)) + check_result(mod, {"x": x_data, "y": y_data}, (2, 2), (y_data * y_data) - (x_data + x_data)) def test_extern_dnnl(): @@ -301,28 +266,10 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): - json, lib, _ = relay.build(mod, "llvm") - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++11"] - tmp_path = util.tempdir() - lib_name = 'lib.so' - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.module.load(lib_path) - - ctx = tvm.cpu() - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) - rt_mod.set_input("data", i_data) - rt_mod.set_input("weight1", w1_data) - rt_mod.run() - out = tvm.nd.empty((1, 32, 14, 14), ctx=ctx) - out = rt_mod.get_output(0, out) - - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=ctx) + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w1_data) - - tvm.testing.assert_allclose(out.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + check_result(mod, {"data": i_data, "weight1": w1_data}, + (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) @nottest @@ -351,8 +298,8 @@ def test_extern_dnnl_mobilenet(): if __name__ == "__main__": - # test_multi_node_subgraph() - # test_extern_gcc_single_op() - # test_extern_gcc() + test_multi_node_subgraph() + test_extern_gcc_single_op() + test_extern_gcc() test_extern_dnnl() # test_extern_dnnl_mobilenet() From 7beb0e34b3eee115cd3e30c2bb4d2d000a21738a Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Wed, 27 Nov 2019 13:44:51 -0800 Subject: [PATCH 32/34] fix ci --- cmake/config.cmake | 1 - cmake/modules/contrib/Extern.cmake | 13 +++++-------- include/tvm/runtime/contrib/dnnl/dnnl_kernel.h | 2 +- src/relay/backend/contrib/contrib_codegen.h | 1 + tests/python/relay/test_pass_partition_graph.py | 13 +++++++------ 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index af785cc4524b..dabe1e930a60 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -165,7 +165,6 @@ set(USE_SORT ON) # Whether use contrib extern (use ";" to separate multiple externs) # Available externs: -# gcc # dnnl set(USE_EXTERN none) diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 2c55779b0a80..96ef6c10c4f8 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -17,15 +17,12 @@ message(STATUS "Build with relay.backend.contrib") -list(FIND USE_EXTERN "gcc" GCC_IDX) -if(GCC_IDX GREATER -1) - file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) - list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) +file(GLOB 
GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) +list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) - file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) - list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) - message(STATUS "Use extern library: GCC") -endif() +file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) +list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) +message(STATUS "Use extern library: GCC") list(FIND USE_EXTERN "dnnl" DNNL_IDX) if(DNNL_IDX GREATER -1) diff --git a/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h b/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h index be9afc2c4011..39ebcc2aa55a 100644 --- a/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h +++ b/include/tvm/runtime/contrib/dnnl/dnnl_kernel.h @@ -18,7 +18,7 @@ */ /*! - * \file src/runtime/contrib/dnnl/dnnl_kernel.h + * \file include/tvm/runtime/contrib/dnnl/dnnl_kernel.h * \brief Use external dnnl library kernels. */ diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h index 7f5b6724c8d6..bbb2b16f46ac 100644 --- a/src/relay/backend/contrib/contrib_codegen.h +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -27,6 +27,7 @@ #include #include #include +#include #include namespace tvm { diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index d5c75b2a935a..13ed3580e09b 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -16,15 +16,15 @@ # under the License. """Unit tests for graph partitioning.""" import numpy as np -from nose.tools import nottest +import pytest import tvm -from tvm import relay import tvm.relay.testing import tvm.relay.transform -from tvm.relay.expr_functor import ExprMutator -from tvm.relay.annotation import subgraph_begin, subgraph_end +from tvm import relay from tvm.contrib import util +from tvm.relay.annotation import subgraph_begin, subgraph_end +from tvm.relay.expr_functor import ExprMutator class GCCAnnotator(ExprMutator): @@ -236,6 +236,7 @@ def test_extern_gcc(): check_result(mod, {"x": x_data, "y": y_data}, (2, 2), (y_data * y_data) - (x_data + x_data)) +@pytest.mark.skip(reason="Only for DEMO purpose") def test_extern_dnnl(): dtype = 'float32' ishape = (1, 32, 14, 14) @@ -272,7 +273,7 @@ def test_extern_dnnl(): (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) -@nottest +@pytest.mark.skip(reason="Only for DEMO purpose") def test_extern_dnnl_mobilenet(): # FIXME: This test is only for demo purpose and supposed to be removed. 
dtype = 'float32' @@ -301,5 +302,5 @@ def test_extern_dnnl_mobilenet(): test_multi_node_subgraph() test_extern_gcc_single_op() test_extern_gcc() - test_extern_dnnl() + # test_extern_dnnl() # test_extern_dnnl_mobilenet() From 5ff7fa6937ac16a0b6e2e2133ae7abd8898d67e7 Mon Sep 17 00:00:00 2001 From: Zhi Chen Date: Thu, 28 Nov 2019 22:10:25 +0000 Subject: [PATCH 33/34] more cleanup --- cmake/modules/contrib/Extern.cmake | 4 -- python/tvm/relay/op/op.py | 4 +- src/relay/backend/compile_engine.cc | 33 ++-------------- src/relay/backend/compile_engine.h | 2 - src/relay/backend/contrib/contrib_codegen.h | 40 ++++++++++++++++++- src/relay/backend/contrib/dnnl/codegen.cc | 19 --------- src/relay/backend/contrib/gcc/codegen.cc | 27 ++++++------- src/relay/backend/contrib/gcc/libs.cc | 43 --------------------- tutorials/dev/custom_relay_backend.py | 12 +++--- 9 files changed, 64 insertions(+), 120 deletions(-) delete mode 100644 src/relay/backend/contrib/gcc/libs.cc diff --git a/cmake/modules/contrib/Extern.cmake b/cmake/modules/contrib/Extern.cmake index 96ef6c10c4f8..cf381a080b88 100644 --- a/cmake/modules/contrib/Extern.cmake +++ b/cmake/modules/contrib/Extern.cmake @@ -20,10 +20,6 @@ message(STATUS "Build with relay.backend.contrib") file(GLOB GCC_RELAY_CONTRIB_SRC src/relay/backend/contrib/gcc/codegen.cc) list(APPEND COMPILER_SRCS ${GCC_RELAY_CONTRIB_SRC}) -file(GLOB GCC_CONTRIB_SRC src/runtime/contrib/gcc/*.cc) -list(APPEND RUNTIME_SRCS ${GCC_CONTRIB_SRC}) -message(STATUS "Use extern library: GCC") - list(FIND USE_EXTERN "dnnl" DNNL_IDX) if(DNNL_IDX GREATER -1) file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index a70068c95047..a30688c9fafc 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -275,8 +275,8 @@ def register_extern_op(op_name, fextern=None, level=10): op_name : str The name of the operator. - fextern: function (attrs: Attrs, args: List[Expr], compiler: str) -> - new_expr: Expr + fextern : function (attrs: Attrs, args: List[Expr], compiler: str) + -> new_expr: Expr The function for wrapping a call expr with subgraph_start and subgraph_end. diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 798d8b632bb2..083fa5d5610c 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -21,11 +21,8 @@ * \file relay/backend/compile_engine.cc * \brief Internal compialtion engine. */ -#include "compile_engine.h" - #include #include -#include #include #include #include @@ -39,9 +36,8 @@ #include #include #include - -#include "contrib/contrib_codegen.h" #include "../ir/type_functor.h" +#include "compile_engine.h" namespace tvm { namespace relay { @@ -598,15 +594,8 @@ class CompileEngineImpl : public CompileEngineNode { PackedFunc JIT(const CCacheKey& key) final { CCacheValue value = LowerInternal(key); if (value->packed_func != nullptr) return value->packed_func; - // Handle 3rd party generated code library. - if (value->lib.operator->()) { - auto name = FunctionGetAttr(key->source_func, attr::kFuncName); - const tvm::ir::StringImm* func_name = name.as(); - CHECK(func_name); - value->lib.GetFunction("init")(); - value->packed_func = value->lib.GetFunction(func_name->value); - } else if (const auto* f = runtime::Registry::Get("relay.backend.build")) { - // build the function. + // build the function. 
+ if (const auto* f = runtime::Registry::Get("relay.backend.build")) { tvm::runtime::Module m = (*f)(value->cached_func->funcs, key->target); value->packed_func = m.GetFunction(value->cached_func->func_name); } else { @@ -659,22 +648,6 @@ class CompileEngineImpl : public CompileEngineNode { value->use_count = 0; cache_[key] = value; } - - if (key->source_func->IsExternal()) { - auto compiler = FunctionGetAttr(key->source_func, attr::kExternal); - const tvm::ir::StringImm* code_gen = compiler.as(); - CHECK(code_gen); - std::string ext_name = "relay.ext." + code_gen->value; - auto pf = tvm::runtime::Registry::Get(ext_name); - CHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; - - // Invoke the 3rd party codegen to generate a library for the subgraph. - runtime::Module mod = (*pf)(key->source_func); - value->lib = mod; - value->cached_func = CachedFunc(); - return value; - } - - Enforce use the target. With target_scope(key->target); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 2849ca7fe7ad..31e246ecf1fe 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -137,8 +137,6 @@ class CCacheValueNode : public Node { CachedFunc cached_func; /*! \brief Result of Packed function generated by JIT */ PackedFunc packed_func; - /*! \brief An external library generated by the 3rd party codegen. */ - runtime::Module lib; /*! \brief usage statistics */ int use_count{0}; diff --git a/src/relay/backend/contrib/contrib_codegen.h b/src/relay/backend/contrib/contrib_codegen.h index bbb2b16f46ac..f7e651251b97 100644 --- a/src/relay/backend/contrib/contrib_codegen.h +++ b/src/relay/backend/contrib/contrib_codegen.h @@ -185,8 +185,46 @@ class ExternSourcePrinter { code_stream_ << "}"; } - virtual std::string JIT(void) = 0; + /*! + * \brief Emit the code for the external runtime. + * + * \return The code string. + */ + virtual std::string JIT() = 0; + + /*! + * \brief Extract the shape from a Relay tensor type. + * + * \param type The provided type. + * + * \return The extracted shape in a list. + */ + std::vector GetShape(const Type& type) const { + const auto* ttype = type.as(); + CHECK(ttype) << "Expect TensorTypeNode"; + std::vector shape; + for (size_t i = 0; i < ttype->shape.size(); ++i) { + auto* val = ttype->shape[i].as(); + CHECK(val); + shape.push_back(val->value); + } + return shape; + } + /*! + * \brief A common interface used by various external runtimes to + * generate the wrapper that invokes external kernels. + * + * \param subgraph_id The unique id of an external function. It will be used + * during runtime to pick the correct external function. + * \param args The arguments used by the external function. + * \param buf_decl The declaration of temporary buffers used to store the + * intermediate results of each external kernel. + * \param body The statements of the external function. + * \param out The name and id pairs for output. + * + * \return The emitted code string. + */ std::string JitImpl(std::string subgraph_id, std::vector args, std::vector buf_decl, diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 2f66a446194f..49fe916f6446 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -185,25 +185,6 @@ class DnnlBuilder : public ExprVisitor, public ExternSourcePrinter { /*! \brief The name of the the outputs. */ std::vector> out_; - /*! 
- * \brief Extract the shape from a Relay tensor type. - * - * \param type The provided type. - * - * \return The extracted shape in a list. - */ - std::vector GetShape(const Type& type) const { - const auto* ttype = type.as(); - CHECK(ttype); - std::vector shape; - for (size_t i = 0; i < ttype->shape.size(); ++i) { - auto* val = ttype->shape[i].as(); - CHECK(val); - shape.push_back(val->value); - } - return shape; - } - /*! * \brief Check if a call has the provided name. * diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index 9806f1942c88..ce327d6a2c37 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -108,7 +108,12 @@ class GccBuilder : public ExprVisitor, public ExternSourcePrinter { out_.push_back({out, out_size}); } - std::string JIT(void) { + /*! + * \brief Emit the source code that invokes gcc compatible wrappers. + * + * \return The emitted code. + */ + std::string JIT() { // Write function macros for (auto decl : func_decl_) { code_stream_ << decl << "\n"; @@ -117,26 +122,22 @@ class GccBuilder : public ExprVisitor, public ExternSourcePrinter { } private: + /*! \brief The subgraph id that represents an GCC external function. */ std::string subgraph_id_ = ""; + /*! \brief The index of an external function. */ int func_idx = 0; + /*! \brief The index of allocated buffers. */ int buf_idx_ = 0; + /*! \brief The arguments of a GCC compatible external function. */ std::vector subgraph_args_; + /*! \brief The statements of a GCC compatible external function. */ std::vector subgraph_body; + /*! \brief The declaration statements of a GCC compatible external function. */ std::vector func_decl_; + /*! \brief The declaration statements of buffers. */ std::vector buf_decl_; + /*! \brief The name and index pairs for output. */ std::vector> out_; - - std::vector GetShape(const Type& type) const { - const auto* ttype = type.as(); - CHECK(ttype); - std::vector shape; - for (size_t i = 0; i < ttype->shape.size(); ++i) { - auto* val = ttype->shape[i].as(); - CHECK(val); - shape.push_back(val->value); - } - return shape; - } }; class GccCodegen : public ExternCodegenBase { diff --git a/src/relay/backend/contrib/gcc/libs.cc b/src/relay/backend/contrib/gcc/libs.cc deleted file mode 100644 index 472fc12e4323..000000000000 --- a/src/relay/backend/contrib/gcc/libs.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -#include -#include - -typedef struct { - float** data; -} GccPackedArgs; - -#define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ - } - -#define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index 0e3d8e93fad0..d258c615da98 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -19,19 +19,19 @@ .. _tutorial-custom-relay-backend: Bring Your Own Codegen To TVM -============================================ +============================= **Author**: `Zhi Chen `_, `Cody Hao Yu `_ As the hardware devices targeted by deep learning workloads keep increasing, the required knowledge for users to achieve high performance on various devices keeps increasing as well. To free data scientists from worrying about the performance when developing a new model, hardware vendors either provide libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, -or provide frameworks such as TensorRT to let users describle their models in a certain way to +or provide frameworks such as TensorRT to let users describe their models in a certain way to achieve high performance. However, users have to learn a new programming interface when they attempt to work on a new library or device. As a result, the demand of a unified programming interface becomes more and more important to 1) let all users and hardware vendors stand on the same page, and 2) provide a feasible solution to allow a specialized hardware or library to only -support widely used operators with extremely high perofrmance, but fallback unsupported operators +support widely used operators with extremely high performance, but fallback unsupported operators to general devices like CPU/GPU. In this tutorial, we demonstrate how a hardware vendor can easily implement @@ -86,14 +86,14 @@ def multiply(attrs, args): # can define more complicated rules. For example, we can only support conv2d # with float32 data type or with kernel size 1x1. In addition, the vendors can # also check the attributes associated with a given operator to decide if it is -# supported by checking the fields in `attrs`. In a even more complicated but +# supported by checking the fields in `attrs`. In an even more complicated but # interesting scenario, we also allow developers to check the sequence of # operators through iterating on the `agrs`. However, this is only # unidirectional as only the inputs are visible. # # After annotating whether an operator can be executed on the given backend. # Users can directly invoke the partitioning pass to separate the graph into -# multiple segments. The C++ backend implements a partitioning pass to fullfil +# multiple segments. The C++ backend implements a partitioning pass to fulfill # the task and creates subgraphs/sub-functions with *External* attribute, # indicating that this function will be handled by external codegen tool. # Therefore, Relay passes should skip optimizations on them. 
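To make the rule-checking and partitioning flow described above concrete, here is a minimal sketch, assuming the same (attrs, args) rule signature used by the multiply example in this tutorial; the conv2d constraints, the helper names, and the direct call to relay.transform.PartitionGraph() are illustrative assumptions rather than the exact code added by this patch.

from tvm import relay

def conv2d_rule(attrs, args):
    # Hypothetical rule: offload conv2d only when the kernel is 1x1 and the
    # input data is float32; everything else falls back to the default backend.
    is_1x1 = all(int(k) == 1 for k in attrs.kernel_size)
    is_fp32 = args[0].checked_type.dtype == "float32"
    return is_1x1 and is_fp32

def partition(mod):
    # Group the annotated operators into sub-functions that carry the
    # *External* attribute so later Relay passes leave them untouched.
    return relay.transform.PartitionGraph()(mod)

Note that checked_type is only populated after type inference, so a real rule may need to run relay.transform.InferType() on the module before inspecting argument types.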
@@ -274,7 +274,7 @@ def visit_call(self, call): # The complete GCC backend implementation is in the TVM codebase # so we can directly use it in this tutorial for demonstration. # -# Multiple external backends can be eneabled simultaneously by ";". +# Multiple external backends can be enabled simultaneously by ";". # For example: set(USE_EXTERN gcc;dnnl) import numpy as np From cf7ac3c1f7acad22e50d688eafa4fd4558b203bd Mon Sep 17 00:00:00 2001 From: Cody Hao Yu Date: Sun, 1 Dec 2019 19:28:22 +0000 Subject: [PATCH 34/34] fix typo --- src/relay/backend/contrib/dnnl/codegen.cc | 2 +- src/relay/backend/contrib/gcc/codegen.cc | 2 +- tutorials/dev/custom_relay_backend.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 49fe916f6446..1057af560665 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -235,7 +235,7 @@ class DNNLCodegen : public ExternCodegenBase { * * \return The runtime module that contains C source code. */ - runtime::Module CreateExternModule(const NodeRef& ref) { + runtime::Module CreateExternModule(const NodeRef& ref) override { // Create headers code_stream_ << "#include \n"; code_stream_ << "#include \n"; diff --git a/src/relay/backend/contrib/gcc/codegen.cc b/src/relay/backend/contrib/gcc/codegen.cc index ce327d6a2c37..0530dec9ae79 100644 --- a/src/relay/backend/contrib/gcc/codegen.cc +++ b/src/relay/backend/contrib/gcc/codegen.cc @@ -154,7 +154,7 @@ class GccCodegen : public ExternCodegenBase { code_stream_ << builder.JIT(); } - runtime::Module CreateExternModule(const NodeRef& ref) { + runtime::Module CreateExternModule(const NodeRef& ref) override { // Create headers code_stream_ << "#include \n"; code_stream_ << "#include \n"; diff --git a/tutorials/dev/custom_relay_backend.py b/tutorials/dev/custom_relay_backend.py index d258c615da98..ba8ce514d27d 100644 --- a/tutorials/dev/custom_relay_backend.py +++ b/tutorials/dev/custom_relay_backend.py @@ -22,7 +22,7 @@ ============================= **Author**: `Zhi Chen `_, `Cody Hao Yu `_ -As the hardware devices targeted by deep learning workloads keep increasing, the required knowledge +As the number of hardware devices targeted by deep learning workloads keeps increasing, the required knowledge for users to achieve high performance on various devices keeps increasing as well. To free data scientists from worrying about the performance when developing a new model, hardware vendors either provide libraries such as MKLDNN or cuDNN with many commonly used deep learning operators, @@ -134,7 +134,7 @@ def multiply(attrs, args): # out # # The implementation is shown as follows. As can be seen, the annotator is derived from -# `ExprMutator` that traverses a Relay graph and allows we to mutate it. We know that all ops +# `ExprMutator` that traverses a Relay graph and allows us to mutate it. We know that all ops # are `call` nodes in Relay graph, so we override the call node mutator `visit_call` in # `ExprMutator` and insert annotations.
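As a rough sketch of that pattern, the annotator below wraps every whitelisted call with the subgraph_begin/subgraph_end annotations used by the unit tests in this series; the class name, the operator whitelist, and the "gcc" compiler string are assumptions for illustration rather than the tutorial's exact code.

from tvm import relay
from tvm.relay.annotation import subgraph_begin, subgraph_end
from tvm.relay.expr_functor import ExprMutator

class WhitelistAnnotator(ExprMutator):
    # Wrap each supported call node in its own single-operator subgraph.
    def __init__(self, supported_ops, compiler):
        super().__init__()
        self.supported_ops = supported_ops
        self.compiler = compiler

    def visit_call(self, call):
        if getattr(call.op, "name", None) in self.supported_ops:
            # Open the external region at every argument ...
            args = [subgraph_begin(self.visit(arg), self.compiler)
                    for arg in call.args]
            new_call = relay.Call(call.op, args, call.attrs, call.type_args)
            # ... and close it right after the call itself.
            return subgraph_end(new_call, self.compiler)
        return super().visit_call(call)

A driver would then rewrite the entry function, for example mod["main"] = WhitelistAnnotator({"add", "subtract", "multiply"}, "gcc").visit(mod["main"]), before invoking the partitioning pass.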