From 8745a3c9966351a3840961c17d94e0adb6708adf Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 23 Feb 2022 15:49:31 -0800
Subject: [PATCH 01/21] Add memory pools to Metadata classes.

---
 include/tvm/runtime/metadata.h |  9 +++++++
 src/runtime/metadata.cc        |  3 +++
 src/target/metadata.h          | 25 ++++++++++++++++--
 tests/cpp/aot_metadata_test.cc | 46 +++++++++++++++++++++++++++++++---
 4 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/include/tvm/runtime/metadata.h b/include/tvm/runtime/metadata.h
index b716d41c5d27..49eef34d0819 100644
--- a/include/tvm/runtime/metadata.h
+++ b/include/tvm/runtime/metadata.h
@@ -75,6 +75,13 @@ struct TVMMetadata {
   const struct TVMTensorInfo* outputs;
   /*! \brief Number of elements in `outputs` array. */
   int64_t num_outputs;
+  /*! \brief Memory Pools needed by the AOT run_model function.
+   * The order of the elements is the same as in the arguments to run_model. That is to say,
+   * this array specifies the last `num_pools` arguments to run_model.
+   */
+  const struct TVMTensorInfo* pools;
+  /*! \brief Number of elements in `pools` array. */
+  int64_t num_pools;
   /*! \brief Name of the model, as passed to tvm.relay.build. */
   const char* mod_name;
 };
@@ -114,6 +121,8 @@ class MetadataNode : public MetadataBaseNode {
   ArrayAccessor<struct TVMTensorInfo, TensorInfo> inputs();
   inline int64_t num_outputs() const { return data_->num_outputs; }
   ArrayAccessor<struct TVMTensorInfo, TensorInfo> outputs();
+  inline int64_t num_pools() const { return data_->num_pools; }
+  ArrayAccessor<struct TVMTensorInfo, TensorInfo> pools();
   inline ::tvm::runtime::String mod_name() const { return ::tvm::runtime::String(data_->mod_name); }
   const struct ::TVMMetadata* data() const { return data_; }
   TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, MetadataBaseNode);
diff --git a/src/runtime/metadata.cc b/src/runtime/metadata.cc
index 7ca333b06c82..df2bb0d461f3 100644
--- a/src/runtime/metadata.cc
+++ b/src/runtime/metadata.cc
@@ -34,6 +34,9 @@ ArrayAccessor<struct TVMTensorInfo, TensorInfo> MetadataNode::inputs() {
 ArrayAccessor<struct TVMTensorInfo, TensorInfo> MetadataNode::outputs() {
   return ArrayAccessor<struct TVMTensorInfo, TensorInfo>(data_->outputs, data_->num_outputs);
 }
+ArrayAccessor<struct TVMTensorInfo, TensorInfo> MetadataNode::pools() {
+  return ArrayAccessor<struct TVMTensorInfo, TensorInfo>(data_->pools, data_->num_pools);
+}
 
 TVM_REGISTER_OBJECT_TYPE(MetadataBaseNode);
 
diff --git a/src/target/metadata.h b/src/target/metadata.h
index 2621d5d4e65d..e77ea0bce71a 100644
--- a/src/target/metadata.h
+++ b/src/target/metadata.h
@@ -71,6 +71,17 @@ class VisitableMetadataNode : public ::tvm::runtime::metadata::MetadataNode {
     v->Visit("outputs", &outputs_metadata_array);
     int64_t num_outputs_cpp = num_outputs();
     v->Visit("num_outputs", &num_outputs_cpp);
+    auto pools_array = Array<ObjectRef>();
+    auto pools_accessor = pools();
+    pools_array.reserve(num_pools());
+    for (int64_t i = 0; i < num_pools(); ++i) {
+      pools_array.push_back(::tvm::runtime::metadata::TensorInfo{pools_accessor[i]});
+    }
+    ::tvm::runtime::metadata::MetadataArray pools_metadata_array{
+        pools_array, ::tvm::runtime::metadata::MetadataTypeIndex::kMetadata, "TVMTensorInfo"};
+    v->Visit("pools", &pools_metadata_array);
+    int64_t num_pools_cpp = num_pools();
+    v->Visit("num_pools", &num_pools_cpp);
     ::std::string mod_name_cpp{data()->mod_name};
     v->Visit("mod_name", &mod_name_cpp);
   }
@@ -86,19 +97,22 @@ class VisitableMetadataNode : public ::tvm::runtime::metadata::MetadataNode {
 class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNode {
  public:
   InMemoryMetadataNode()
-      : InMemoryMetadataNode(0 /* version */, {} /* inputs */, {} /* outputs */,
+      : InMemoryMetadataNode(0 /* version */, {} /* inputs */, {} /* outputs */, {} /* pools */,
                              "" /* mod_name */) {}
   InMemoryMetadataNode(int64_t version,
                        const ::std::vector<::tvm::runtime::metadata::TensorInfo>& inputs,
                        const ::std::vector<::tvm::runtime::metadata::TensorInfo>& outputs,
+                       const ::std::vector<::tvm::runtime::metadata::TensorInfo>& pools,
                        const ::tvm::runtime::String mod_name)
       : VisitableMetadataNode{&storage_},
         inputs_{new struct TVMTensorInfo[inputs.size()]()},
         inputs_objs_{inputs},
         outputs_{new struct TVMTensorInfo[outputs.size()]()},
         outputs_objs_{outputs},
+        pools_{new struct TVMTensorInfo[pools.size()]()},
+        pools_objs_{pools},
         mod_name_{mod_name},
-        storage_{version, nullptr, 0, nullptr, 0, mod_name_.c_str()} {
+        storage_{version, nullptr, 0, nullptr, 0, nullptr, 0, mod_name_.c_str()} {
     storage_.inputs = inputs_.get();
     storage_.num_inputs = inputs.size();
     for (unsigned int i = 0; i < inputs.size(); ++i) {
@@ -109,6 +123,11 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo
     for (unsigned int i = 0; i < outputs.size(); ++i) {
       outputs_.get()[i] = *outputs[i]->data();
     }
+    storage_.pools = pools_.get();
+    storage_.num_pools = pools.size();
+    for (unsigned int i = 0; i < pools.size(); ++i) {
+      pools_.get()[i] = *pools[i]->data();
+    }
   }
 
  private:
@@ -116,6 +135,8 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo
   std::vector<::tvm::runtime::metadata::TensorInfo> inputs_objs_;
   ::std::unique_ptr<struct TVMTensorInfo> outputs_;
   std::vector<::tvm::runtime::metadata::TensorInfo> outputs_objs_;
+  ::std::unique_ptr<struct TVMTensorInfo> pools_;
+  std::vector<::tvm::runtime::metadata::TensorInfo> pools_objs_;
   ::std::string mod_name_;
   struct ::TVMMetadata storage_;
 };
diff --git a/tests/cpp/aot_metadata_test.cc b/tests/cpp/aot_metadata_test.cc
index 730762237488..abf37ce4569a 100644
--- a/tests/cpp/aot_metadata_test.cc
+++ b/tests/cpp/aot_metadata_test.cc
@@ -37,8 +37,11 @@ const int64_t kNormalOutput1Shape[3] = {3, 8, 8};
 const struct TVMTensorInfo kNormalOutputs[1] = {
     {"output1", kNormalOutput1Shape, 3, DLDataType{3, 4, 5}}};
 
+const int64_t kNormalPool1Shape[3] = {3, 8, 8};
+const struct TVMTensorInfo kNormalPools[1] = {{"pool1", kNormalPool1Shape, 3, DLDataType{3, 4, 7}}};
+
 const struct TVMMetadata kNormal = {
-    TVM_METADATA_VERSION, kNormalInputs, 2, kNormalOutputs, 1, "default",
+    TVM_METADATA_VERSION, kNormalInputs, 2, kNormalOutputs, 1, kNormalPools, 1, "default",
 };
 }  // namespace
 
@@ -74,6 +77,14 @@ TEST(Metadata, ParseStruct) {
   EXPECT_THAT(output1->shape(), ElementsAre(3, 8, 8));
   EXPECT_THAT(output1->dtype(), Eq(tvm::runtime::DataType(DLDataType{3, 4, 5})));
 
+  auto pools = md->pools();
+  EXPECT_THAT(pools.size(), Eq(1));
+
+  auto pool1 = pools[0];
+  EXPECT_THAT(pool1->name(), Eq("pool1"));
+  EXPECT_THAT(pool1->shape(), ElementsAre(3, 8, 8));
+  EXPECT_THAT(pool1->dtype(), Eq(tvm::runtime::DataType(DLDataType{3, 4, 7})));
+
   EXPECT_THAT(md->mod_name(), Eq("default"));
 }
 
@@ -131,7 +142,8 @@ TEST(Metadata, Visitor) {
   ::tvm::ReflectionVTable::Global()->VisitAttrs(md.operator->(), &v);
 
   EXPECT_THAT(v.keys, ElementsAre(StrEq("version"), StrEq("inputs"), StrEq("num_inputs"),
-                                  StrEq("outputs"), StrEq("num_outputs"), StrEq("mod_name")));
+                                  StrEq("outputs"), StrEq("num_outputs"), StrEq("pools"),
+                                  StrEq("num_pools"), StrEq("mod_name")));
   EXPECT_THAT(Downcast<tvm::IntImm>(v.values[0])->value, Eq(TVM_METADATA_VERSION));
 
   EXPECT_THAT(Downcast<tvm::IntImm>(v.values[0])->value, Eq(TVM_METADATA_VERSION));
@@ -164,6 +176,19 @@ TEST(Metadata, Visitor) {
 
   auto num_outputs = Downcast<tvm::IntImm>(v.values[4]);
   EXPECT_THAT(num_outputs->value, Eq(1));
+
+  auto pool_array = Downcast<tvm::runtime::metadata::MetadataArray>(v.values[5]);
+  EXPECT_THAT(pool_array->type_index, Eq(tvm::runtime::metadata::MetadataTypeIndex::kMetadata));
+  EXPECT_THAT(pool_array->struct_name, StrEq("TVMTensorInfo"));
+  auto pool1 = Downcast<tvm::runtime::metadata::TensorInfo>(pool_array->array[0]);
+
+  EXPECT_THAT(pool1->name(), Eq("pool1"));
+
+  auto num_pools = Downcast<tvm::IntImm>(v.values[6]);
+  EXPECT_THAT(num_pools->value, Eq(1));
+
+  auto mod_name = Downcast<tvm::String>(v.values[7]);
+  EXPECT_THAT(mod_name, Eq("default"));
 }
 
 using ::tvm::runtime::make_object;
@@ -184,6 +209,10 @@ TEST(Metadata, InMemory) {
               make_object<tvm::target::metadata::InMemoryTensorInfoNode>(
                   tvm::String("Output1"), std::vector<int64_t>{3, 8, 8},
                   tvm::runtime::DataType(DLDataType{3, 4, 5})))}),
+          std::vector<tvm::runtime::metadata::TensorInfo>({tvm::runtime::metadata::TensorInfo(
+              make_object<tvm::target::metadata::InMemoryTensorInfoNode>(
+                  tvm::String("Pool1"), std::vector<int64_t>{5, 10, 10},
+                  tvm::runtime::DataType(DLDataType{3, 4, 7})))}),
           "default"));
 
   auto md_data = md->data();
@@ -211,6 +240,13 @@ TEST(Metadata, InMemory) {
   EXPECT_THAT(tvm::runtime::DataType(output0->dtype),
               Eq(tvm::runtime::DataType(DLDataType({3, 4, 5}))));
 
+  auto pool0 = &md_data->pools[0];
+  EXPECT_THAT(pool0->name, StrEq("Pool1"));
+  EXPECT_THAT(std::vector<int64_t>(pool0->shape, pool0->shape + pool0->num_shape),
+              ElementsAre(5, 10, 10));
+  EXPECT_THAT(tvm::runtime::DataType(pool0->dtype),
+              Eq(tvm::runtime::DataType(DLDataType({3, 4, 7}))));
+
   EXPECT_THAT(md_data->mod_name, StrEq("default"));
 }
 
@@ -222,7 +258,7 @@ TEST(Metadata, ZeroElementLists) {
               make_object<tvm::target::metadata::InMemoryTensorInfoNode>(
                   tvm::String("Output1"), std::vector<int64_t>{},
                   tvm::runtime::DataType(DLDataType{3, 4, 5})))}),
-          "default"));
+          std::vector<tvm::runtime::metadata::TensorInfo>({}), "default"));
 
   EXPECT_THAT(md->data()->num_inputs, Eq(0));
   EXPECT_THAT(md->inputs().size(), Eq(0));
@@ -233,4 +269,8 @@ TEST(Metadata, ZeroElementLists) {
   EXPECT_THAT(output0.num_shape, Eq(0));
   EXPECT_THAT(md->outputs()[0]->shape().size(), Eq(0));
   EXPECT_THAT(md->outputs()[0]->shape(), ElementsAre());
+
+  EXPECT_THAT(md->pools().size(), Eq(0));
+  EXPECT_THAT(md->num_pools(), Eq(0));
+  EXPECT_THAT(md->pools(), ElementsAre());
 }

From 81a43523fbf365654141c829113686f4b06cc57c Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Sun, 28 Nov 2021 22:19:03 -0800
Subject: [PATCH 02/21] Move ShapeToJSON to utils.

---
 src/relay/backend/graph_executor_codegen.cc | 19 ++-----------------
 src/relay/backend/utils.cc                  |  9 +++++++++
 src/relay/backend/utils.h                   |  9 +++++++++
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc
index 227f7bbfdf31..16d0207ff278 100644
--- a/src/relay/backend/graph_executor_codegen.cc
+++ b/src/relay/backend/graph_executor_codegen.cc
@@ -294,21 +294,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   }
 
  protected:
-  /*!
-   * \brief Extract shape from expr to vector<int64_t>
-   *
-   * \param shape
-   * \return std::vector<int64_t>
-   */
-  std::vector<int64_t> _ShapeToJSON(tvm::Array<IndexExpr> shape) {
-    std::vector<int64_t> ret;
-    for (IndexExpr dim : shape) {
-      const int64_t* pval = tir::as_const_int(dim);
-      ret.push_back(*pval);
-    }
-    return ret;
-  }
-
   /*!
    * \brief Add node to graph
    *
@@ -352,7 +337,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
       for (size_t i = 0; i < tuple_type->fields.size(); ++i) {
         if (const auto* typ = tuple_type->fields[i].as<TensorTypeNode>()) {
           ret.push_back(GraphNodeRef(node_id, i));
-          shape.emplace_back(_ShapeToJSON(typ->shape));
+          shape.emplace_back(ShapeToJSON(typ->shape));
           dtype.emplace_back(DType2String(typ->dtype));
         } else {
           LOG(FATAL) << "type " << checked_type->GetTypeKey() << " not supported";
@@ -369,7 +354,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     if (const auto* tensor_type = checked_type.as<TensorTypeNode>()) {
       ShapeVector shape;
       std::vector<std::string> dtype;
-      shape.emplace_back(_ShapeToJSON(tensor_type->shape));
+      shape.emplace_back(ShapeToJSON(tensor_type->shape));
       dtype.emplace_back(DType2String(tensor_type->dtype));
       node->attrs_["shape"] = shape;
       node->attrs_["dtype"] = dtype;
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index 7d0df672bf6f..a3eb00ae984c 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -294,6 +294,15 @@ void UpdateAutoSchedulerOpWeights(const IRModule& module) {
   (*te_compiler_update_weights)(weight_map);
 }
 
+std::vector<int64_t> ShapeToJSON(tvm::Array<IndexExpr> shape) {
+  std::vector<int64_t> ret;
+  for (IndexExpr dim : shape) {
+    const int64_t* pval = tir::as_const_int(dim);
+    ret.push_back(*pval);
+  }
+  return ret;
+}
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 645940e7de89..d04bc9bb26ea 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -45,6 +45,7 @@
 #include <vector>
 
 #include "../../runtime/meta_data.h"
+#include "../../target/metadata.h"
 
 namespace tvm {
 namespace relay {
@@ -587,6 +588,14 @@ Map<Target, IRModule> TargetStrModuleMapToTargetModuleMap(
  */
 void UpdateAutoSchedulerOpWeights(const IRModule& module);
 
+/*!
+ * \brief Extract shape from expr to vector<int64_t>
+ *
+ * \param shape
+ * \return std::vector<int64_t>
+ */
+std::vector<int64_t> ShapeToJSON(tvm::Array<IndexExpr> shape);
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm

From 3ea361b0957950c9e6b21f981c4faf4b9e13c30b Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 15 Feb 2022 13:44:18 -0800
Subject: [PATCH 03/21] Track returned TensorType from AOTExecutorCodegen.

---
 src/relay/backend/aot_executor_codegen.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index a25ef458906c..983d985b24d6 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -70,6 +70,7 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor {
   void Run(const Function& func) { VisitExpr(func); }
 
   std::vector<int> GetReturnIds() const { return return_ids_; }
+  std::vector<TensorType> GetReturnTtypes() const { return return_ttypes_; }
 
   StorageMap GetStorageMap() const { return storage_device_map_; }
 
@@ -177,6 +178,12 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor {
       for (auto sid : sinfo->storage_ids) {
         return_ids_.push_back(sid);
       }
+      return_ttypes_.clear();
+      auto ttypes = FlattenTupleType(e->checked_type());
+      return_ttypes_.reserve(ttypes.size());
+      for (auto ttype : ttypes) {
+        return_ttypes_.push_back(ttype);
+      }
     }
   }
   /*!
@@ -252,6 +259,8 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor {
   int next_available_sid_{0};
   /*! \brief the set of intermediate tensors that are return variables */
   std::vector<int> return_ids_;
+  /*! \brief the data types of the return values */
+  std::vector<TensorType> return_ttypes_;
 };
 
 /*! \brief Code generator for AOT executor */

From 2e4c27722f35da424feda0434b3e2f0ab1aacca8 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 15 Feb 2022 17:32:50 -0800
Subject: [PATCH 04/21] Support calling Relay functions with Tuple.

---
 src/relay/backend/aot_executor_codegen.cc | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 983d985b24d6..575df71ed8b4 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -326,6 +326,16 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     }
   }
 
+  void PushArgs(const Expr& expr, std::vector<tir::Var> sids, Array<PrimExpr>* args) {
+    const TupleNode* t = expr.as<TupleNode>();
+    if (t != nullptr) {
+      CHECK_EQ(sids.size(), t->fields.size()) << "Relay tuple does not map 1:1 into TIR; AOT can't "
+                                                 "handle this type of Relay Expr in a CallNode.";
+    }
+
+    args->insert(args->end(), sids.begin(), sids.end());
+  }
+
   /*!
    * brief Create a function call
    * \param call_lowered_props The lowered function and the arguments to call it with
@@ -343,17 +353,14 @@ class AOTExecutorCodegen : public MixedModeVisitor {
                                            {tir::StringImm(params_by_expr_[arg])});
         args.push_back(tvm::tir::Cast(DataType::Handle(), param_handle));
       } else {
-        auto var_arg = FindExpr(arg);
-        for (const auto& var : var_arg) {
-          args.push_back(var);
-        }
+        auto sids = FindExpr(arg);
+        PushArgs(arg, sids, &args);
       }
     }
 
     // Pack the return(s) value. A call node can produce multiple outputs
-    for (const auto& var : PackSid(result_expr)) {
-      args.push_back(var);
-    }
+    auto result_expr_sid = PackSid(result_expr);
+    PushArgs(result_expr, result_expr_sid, &args);
 
     // Use tvm_call_packed to execute the function unless we're calling directly
     auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked();

From e5571e2bad57278f22c646deb29a9e1f85821d96 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 15 Feb 2022 17:33:09 -0800
Subject: [PATCH 05/21] Expand supported TIR calling conventions to work with
 C++ runtime.

---
 src/relay/backend/aot_executor_codegen.cc | 59 ++++++++++++++++++++---
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 575df71ed8b4..65f293b15cdb 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -27,6 +27,7 @@
 #include <tvm/relay/attrs/call.h>
 #include <tvm/relay/executor.h>
 #include <tvm/relay/expr_functor.h>
+#include <tvm/relay/runtime.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/object.h>
 #include <tvm/tir/analysis.h>
@@ -362,16 +363,19 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     auto result_expr_sid = PackSid(result_expr);
     PushArgs(result_expr, result_expr_sid, &args);
 
-    // Use tvm_call_packed to execute the function unless we're calling directly
-    auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked();
+    // Choose call style based on Runtime/Executor config.
+    Op calling_pattern;
     if (use_unpacked_api_) {
       calling_pattern = tvm::tir::builtin::call_extern();
+    } else if (use_call_cpacked_) {
+      calling_pattern = tvm::tir::builtin::tvm_call_cpacked();
+    } else {
+      calling_pattern = tvm::tir::builtin::tvm_call_packed();
     }
 
     GlobalVar global_var = call_lowered_props.lowered_func;
     tir::Var empty_var("no_device_context", DataType::Handle());
     bool has_c_device_api_context = device_contexts_.count(global_var) != 0;
-    bool use_cpacked_api = !use_unpacked_api_;
 
     // The device context is passed to the operator in one of the following calling patterns:
     //  * Unpacked / direct function call with context:
@@ -395,7 +399,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
           func_call,
           GenerateDeviceHook(context, "Close"),
       }));
-    } else if (use_cpacked_api) {
+    } else if (use_call_cpacked_) {
       // call_cpacked calling convention needs a blank context
       args.push_back(tir::make_zero(DataType::Handle()));
       tir::Evaluate func_call(tvm::tir::Call(DataType::Int(32), calling_pattern, args));
@@ -828,13 +832,29 @@ class AOTExecutorCodegen : public MixedModeVisitor {
   Target target_host_;
   /*!
    * \brief unpacked api toggle
-   * When set to true the code generated will use unpacked calls to functions:
+   * When set to true, the generated code will use unpacked calls to functions:
    * func(void* arg0, void* arg1)
-   * Rather than packed calls:
-   * func(void* args)
+   * Rather than packed calls (in which arg0 and arg1 are in `arg_values`).
+   * func(TVMValue* arg_values, int* arg_type_codes, int num_args, ...)
    * Defaults to using the packed calling convention
+   *
+   * Unpacked API is supported when runtime == "c" and interface_api is "c".
    */
   Bool use_unpacked_api_;
+  /*!
+   * \brief cpacked api toggle
+   * When set to true, the generated code will use call_cpacked to call functions directly, assuming
+   * they exist in a DSO-exportable module:
+   * func(...)
+   * Rather than through the traditional call_packed calls, which should use function pointers
+   * looked-up through TVMBackendGetFuncFromEnv:
+   * TVMBackendPackedCFunc* func_ptr = TVMBackendGetFuncFromEnv("func");
+   * func_ptr(...)
+   * Defaults to using the packed calling convention
+   *
+   * call_cpacked is required when runtime is "c++" and supported when runtime is "c"
+   */
+  Bool use_call_cpacked_;
 
   /*!
    * \brief parameters (i.e. ConstantNodes found in the graph).
@@ -863,7 +883,11 @@ class AOTExecutorCodegen : public MixedModeVisitor {
 
  public:
   AOTExecutorCodegen(runtime::Module* mod, const tec::TargetMap& targets, Target target_host)
-      : mod_(mod), targets_(targets), target_host_(target_host), use_unpacked_api_(Bool(false)) {}
+      : mod_(mod),
+        targets_(targets),
+        target_host_(target_host),
+        use_unpacked_api_(Bool(false)),
+        use_call_cpacked_(Bool(false)) {}
 
   LoweredOutput Codegen(IRModule mod, relay::Function func, String mod_name) {
     VLOG_CONTEXT << "AOT";
@@ -873,11 +897,30 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     ICHECK(target_host_.defined()) << "require a target_host to be given for AOT codegen";
     VLOG(1) << "target host: " << target_host_->ToDebugString();
 
+    Runtime runtime_config = mod->GetAttr<Runtime>(tvm::attr::kRuntime).value();
     Executor executor_config = mod->GetAttr<Executor>(tvm::attr::kExecutor).value();
     String interface_api = executor_config->GetAttr<String>("interface-api").value_or("packed");
     Integer workspace_byte_alignment =
         executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
     use_unpacked_api_ = executor_config->GetAttr<Bool>("unpacked-api").value_or(Bool(false));
+    use_call_cpacked_ = !use_unpacked_api_;
+
+    // Validate choice of use_unpacked_api_ and use_call_cpacked_
+    if (runtime_config->name == kTvmRuntimeCrt) {
+      CHECK(interface_api == "c" || static_cast<bool>(use_unpacked_api_) == false)
+          << "Either need interface_api == \"c\" (got: " << interface_api
+          << ") or unpacked-api == false (got: " << use_unpacked_api_
+          << ") when targeting c runtime";
+    } else if (runtime_config->name == kTvmRuntimeCpp) {
+      CHECK(static_cast<bool>(use_unpacked_api_) == false &&
+            static_cast<bool>(use_call_cpacked_) == true)
+          << "Need unpacked-api == false (got: " << use_unpacked_api_
+          << ") and interface-api == \"packed\" (got: " << interface_api
+          << ") when targeting c++ runtime";
+    } else {
+      ICHECK(false) << "runtime_config (" << runtime_config->name
+                    << ") is not one of the expected values";
+    }
 
     // TODO(mbs): Plumb from compiler config
     VirtualDevice host_virtual_device = VirtualDevice::ForTarget(target_host_);

From 9e81293e3876dfcf70a4d625e71dd8344418cc19 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Sun, 28 Nov 2021 22:33:13 -0800
Subject: [PATCH 06/21] Rename MetadataModule to ConstLoaderModule.

---
 python/tvm/contrib/graph_executor.py          |  2 +-
 ...adata_module.cc => const_loader_module.cc} | 89 ++++++++++---------
 src/runtime/const_loader_module.h             | 53 +++++++++++
 src/runtime/meta_data.h                       | 13 ---
 src/target/metadata_module.cc                 | 23 ++---
 5 files changed, 112 insertions(+), 68 deletions(-)
 rename src/runtime/{metadata_module.cc => const_loader_module.cc} (67%)
 create mode 100644 src/runtime/const_loader_module.h

diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py
index ac6803ca9842..e8e2798ef734 100644
--- a/python/tvm/contrib/graph_executor.py
+++ b/python/tvm/contrib/graph_executor.py
@@ -189,7 +189,7 @@ def set_input(self, key=None, value=None, **params):
             keys.sort(key=lambda x: -np.prod(params[x].shape))
             for k in keys:
                 # TODO(zhiics) Skip the weights for submodule in a better way.
-                # We should use MetadataModule for initialization and remove
+                # We should use ConstLoaderModule for initialization and remove
                 # params from set_input
                 val = self._get_input(k)
                 if val:
diff --git a/src/runtime/metadata_module.cc b/src/runtime/const_loader_module.cc
similarity index 67%
rename from src/runtime/metadata_module.cc
rename to src/runtime/const_loader_module.cc
index 7cb986bba62c..5496e161e57f 100644
--- a/src/runtime/metadata_module.cc
+++ b/src/runtime/const_loader_module.cc
@@ -18,13 +18,13 @@
  */
 
 /*!
- * \file src/runtime/metadata_module.cc
- * \brief A wrapper for initializing imported modules using metadata. This
+ * \file src/runtime/const_loader_module.cc
+ * \brief A wrapper for initializing imported modules using constant NDArray. This
  * module is intended to be used by various runtime in the TVM stack, i.e.
  * graph executor, relay VM, AOT runtime, and various user defined runtimes. It
  * paves the way to separate the code and metedata, which makes compilation
  * and/or interpretation more convenient. In addition, the clear separation of
- * code and metadata significantly reduces the efforts for handling external
+ * code and constants significantly reduces the efforts for handling external
  * codegen and runtimes.
  */
 #include <tvm/runtime/container/array.h>
@@ -42,18 +42,19 @@ namespace tvm {
 namespace runtime {
 
 /*!
- * \brief The metadata module is designed to manage initialization of the
- * imported submodules.
+ * \brief The const-loader module is designed to manage initialization of the
+ * imported submodules for the C++ runtime.
  */
-class MetadataModuleNode : public ModuleNode {
+class ConstLoaderModuleNode : public ModuleNode {
  public:
-  MetadataModuleNode(const std::unordered_map<std::string, NDArray>& metadata,
-                     const std::unordered_map<std::string, std::vector<std::string>>& sym_vars)
-      : metadata_(metadata), sym_vars_(sym_vars) {
+  ConstLoaderModuleNode(
+      const std::unordered_map<std::string, NDArray>& const_var_ndarray,
+      const std::unordered_map<std::string, std::vector<std::string>>& const_vars_by_symbol)
+      : const_var_ndarray_(const_var_ndarray), const_vars_by_symbol_(const_vars_by_symbol) {
     // Only the related submodules are cached to reduce the number of runtime
     // symbol lookup for initialization. Otherwise, symbols/primitives in the
     // DSO module will also be cached but they never need to be initialized.
-    for (const auto& it : sym_vars_) {
+    for (const auto& it : const_vars_by_symbol_) {
       initialized_[it.first] = false;
     }
   }
@@ -78,20 +79,20 @@ class MetadataModuleNode : public ModuleNode {
     return PackedFunc(nullptr);
   }
 
-  const char* type_key() const { return "metadata"; }
+  const char* type_key() const { return "const_loader"; }
 
   /*!
-   * \brief Get the list of metadata that is required by the given module.
+   * \brief Get the list of constants that is required by the given module.
    * \param symbol The symbol that is being queried.
    * \return The list of needed NDArray.
    */
-  Array<NDArray> GetRequiredMetadata(const std::string& symbol) {
+  Array<NDArray> GetRequiredConstants(const std::string& symbol) {
     Array<NDArray> ret;
-    ICHECK_GT(sym_vars_.count(symbol), 0U) << "No symbol is recorded for " << symbol;
-    std::vector<std::string> vars = sym_vars_[symbol];
+    ICHECK_GT(const_vars_by_symbol_.count(symbol), 0U) << "No symbol is recorded for " << symbol;
+    std::vector<std::string> vars = const_vars_by_symbol_[symbol];
     for (const auto& it : vars) {
-      ICHECK_GT(metadata_.count(it), 0U) << "Found not recorded constant variable: " << it;
-      ret.push_back(metadata_[it]);
+      ICHECK_GT(const_var_ndarray_.count(it), 0U) << "Found not recorded constant variable: " << it;
+      ret.push_back(const_var_ndarray_[it]);
     }
     return ret;
   }
@@ -102,12 +103,12 @@ class MetadataModuleNode : public ModuleNode {
    * for runtime lookup.
    *
    * \note  A module could be like the following:
-   *  MetadataModuleNode (contains all the metadata)
+   *  ConstLoaderModuleNode (contains all the constants)
    *    - CSourceModule
    *    - JSON runtime module
    *
    *  The initializer iterates through the imported modules and intilizes the
-   *  found module accordingly by passing the needed metadata into it.
+   *  found module accordingly by passing the needed constants into it.
    */
   void InitSubModule(const std::string& symbol) {
     PackedFunc init(nullptr);
@@ -116,8 +117,8 @@ class MetadataModuleNode : public ModuleNode {
       std::string init_name = "__init_" + symbol;
       init = it.GetFunction(init_name, false);
       if (init != nullptr) {
-        auto md = GetRequiredMetadata(symbol);
-        // Initialize the module with metadata.
+        auto md = GetRequiredConstants(symbol);
+        // Initialize the module with constants.
         int ret = init(md);
         // Report the error if initialization is failed.
         ICHECK_EQ(ret, 0) << TVMGetLastError();
@@ -128,32 +129,32 @@ class MetadataModuleNode : public ModuleNode {
 
   void SaveToBinary(dmlc::Stream* stream) final {
     std::vector<std::string> variables;
-    std::vector<NDArray> metadata;
-    for (const auto& it : metadata_) {
+    std::vector<NDArray> const_var_ndarray;
+    for (const auto& it : const_var_ndarray_) {
       String var_name = it.first;
       variables.push_back(var_name);
-      metadata.push_back(it.second);
+      const_var_ndarray.push_back(it.second);
     }
 
     // Save all variables in the function.
     stream->Write(variables);
     // Save all constant data.
-    uint64_t sz = static_cast<uint64_t>(metadata.size());
+    uint64_t sz = static_cast<uint64_t>(const_var_ndarray.size());
     stream->Write(sz);
     for (uint64_t i = 0; i < sz; i++) {
-      metadata[i].Save(stream);
+      const_var_ndarray[i].Save(stream);
     }
 
     // Save the symbol to list of required constant variables mapping
     std::vector<std::string> symbols;
     std::vector<std::vector<std::string>> const_vars;
-    for (const auto& it : sym_vars_) {
+    for (const auto& it : const_vars_by_symbol_) {
       symbols.push_back(it.first);
       const_vars.push_back(it.second);
     }
 
     stream->Write(symbols);
-    sz = static_cast<uint64_t>(sym_vars_.size());
+    sz = static_cast<uint64_t>(const_vars_by_symbol_.size());
     stream->Write(sz);
     for (uint64_t i = 0; i < sz; i++) {
       stream->Write(const_vars[i]);
@@ -165,9 +166,9 @@ class MetadataModuleNode : public ModuleNode {
 
     // Load the variables.
     std::vector<std::string> variables;
-    ICHECK(stream->Read(&variables)) << "Loading variables failed";
+    ICHECK(stream->Read(&variables)) << "Loading variable names failed";
     uint64_t sz;
-    ICHECK(stream->Read(&sz, sizeof(sz))) << "Loading metadata size failed";
+    ICHECK(stream->Read(&sz, sizeof(sz))) << "Loading number of vars failed";
     ICHECK_EQ(static_cast<size_t>(sz), variables.size())
         << "The number of variables and ndarray counts must match";
     // Load the list of ndarray.
@@ -178,10 +179,10 @@ class MetadataModuleNode : public ModuleNode {
       arrays.push_back(temp);
     }
 
-    std::unordered_map<std::string, NDArray> metadata;
+    std::unordered_map<std::string, NDArray> const_var_ndarray;
     for (uint64_t i = 0; i < sz; i++) {
-      ICHECK_EQ(metadata.count(variables[i]), 0U);
-      metadata[variables[i]] = arrays[i];
+      ICHECK_EQ(const_var_ndarray.count(variables[i]), 0U);
+      const_var_ndarray[variables[i]] = arrays[i];
     }
 
     // Load the symbol to list of required constant variables mapping
@@ -196,12 +197,12 @@ class MetadataModuleNode : public ModuleNode {
       const_vars.push_back(vars);
     }
 
-    std::unordered_map<std::string, std::vector<std::string>> sym_vars;
+    std::unordered_map<std::string, std::vector<std::string>> const_vars_by_symbol;
     for (uint64_t i = 0; i < sz; i++) {
-      sym_vars[symbols[i]] = const_vars[i];
+      const_vars_by_symbol[symbols[i]] = const_vars[i];
     }
 
-    auto n = make_object<MetadataModuleNode>(metadata, sym_vars);
+    auto n = make_object<ConstLoaderModuleNode>(const_var_ndarray, const_vars_by_symbol);
     return Module(n);
   }
 
@@ -212,19 +213,21 @@ class MetadataModuleNode : public ModuleNode {
    */
   std::unordered_map<std::string, bool> initialized_;
   /*! \brief Variable name to NDArray mapping. */
-  std::unordered_map<std::string, NDArray> metadata_;
+  std::unordered_map<std::string, NDArray> const_var_ndarray_;
   /*! \brief Symbol name to required constant variables mapping. */
-  std::unordered_map<std::string, std::vector<std::string>> sym_vars_;
+  std::unordered_map<std::string, std::vector<std::string>> const_vars_by_symbol_;
 };
 
-Module MetadataModuleCreate(
-    const std::unordered_map<std::string, NDArray>& metadata,
-    const std::unordered_map<std::string, std::vector<std::string>>& sym_vars) {
-  auto n = make_object<MetadataModuleNode>(metadata, sym_vars);
+Module ConstLoaderModuleCreate(
+    const std::unordered_map<std::string, NDArray>& const_var_ndarray,
+    const std::unordered_map<std::string, std::vector<std::string>>& const_vars_by_symbol) {
+  auto n = make_object<ConstLoaderModuleNode>(const_var_ndarray, const_vars_by_symbol);
   return Module(n);
 }
 
 TVM_REGISTER_GLOBAL("runtime.module.loadbinary_metadata")
-    .set_body_typed(MetadataModuleNode::LoadFromBinary);
+    .set_body_typed(ConstLoaderModuleNode::LoadFromBinary);
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_const_loader")
+    .set_body_typed(ConstLoaderModuleNode::LoadFromBinary);
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/const_loader_module.h b/src/runtime/const_loader_module.h
new file mode 100644
index 000000000000..eb548dfcf370
--- /dev/null
+++ b/src/runtime/const_loader_module.h
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file const_loader_module.h
+ * \brief Defines an interface to use the ConstLoaderModule.
+ */
+
+#ifndef TVM_RUNTIME_CONST_LOADER_MODULE_H_
+#define TVM_RUNTIME_CONST_LOADER_MODULE_H_
+
+#include <tvm/runtime/ndarray.h>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Create a ConstLoader module object.
+ *
+ * \param const_var_ndarray Maps consts var name to NDArray containing data for the var.
+ * \param const_vars_by_symbol Maps the name of a module init function to a list of names of
+ * const vars whose data will be passed to that init function.
+ *
+ * \return The created ConstLoaderModule.
+ */
+Module ConstLoaderModuleCreate(
+    const std::unordered_map<std::string, NDArray>& const_var_ndarray,
+    const std::unordered_map<std::string, std::vector<std::string>>& const_vars_by_symbol);
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_CONST_LOADER_MODULE_H_
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index e83e1a3a7629..c19229243f1f 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -49,19 +49,6 @@ inline String get_name_mangled(const String& module_name, const String& name) {
   return ss.str();
 }
 
-/*!
- * \brief Create a metadata module object.
- *
- * \param metadata The variable name to ndarray mapping.
- * \param sym_vars The symbol to the list of required constant variables
- * mapping.
- *
- * \return The created metadata module.
- */
-Module MetadataModuleCreate(
-    const std::unordered_map<std::string, NDArray>& metadata,
-    const std::unordered_map<std::string, std::vector<std::string>>& sym_vars);
-
 /*! \brief A tag to specify whether or not dynamic shared memory is used */
 constexpr const char* kUseDynamicSharedMemoryTag = "tir.use_dyn_shared_memory";
 
diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
index 2facf1de64d5..14ac3c9b662f 100644
--- a/src/target/metadata_module.cc
+++ b/src/target/metadata_module.cc
@@ -35,7 +35,7 @@ namespace tvm {
 namespace codegen {
 
 runtime::Module CreateMetadataModule(
-    const std::unordered_map<std::string, runtime::NDArray>& params,
+    const std::unordered_map<std::string, runtime::NDArray>& const_var_ndarray,
     tvm::runtime::Module target_module, const Array<runtime::Module>& ext_modules, Target target,
     tvm::relay::Runtime runtime, relay::backend::ExecutorCodegenMetadata metadata) {
   // Here we split modules into two groups:
@@ -52,19 +52,19 @@ runtime::Module CreateMetadataModule(
   bool is_targeting_crt = runtime->name == "crt";
 
   // Wrap all submodules in the initialization wrapper.
-  std::unordered_map<std::string, std::vector<std::string>> sym_metadata;
+  std::unordered_map<std::string, std::vector<std::string>> const_vars_by_symbol;
   for (tvm::runtime::Module mod : ext_modules) {
     auto pf_sym = mod.GetFunction("get_symbol");
     auto pf_var = mod.GetFunction("get_const_vars");
-    std::vector<std::string> arrays;
+    std::vector<std::string> symbol_const_vars;
     if (pf_sym != nullptr && pf_var != nullptr) {
       String symbol = pf_sym();
       Array<String> variables = pf_var();
       for (size_t i = 0; i < variables.size(); i++) {
-        arrays.push_back(variables[i].operator std::string());
+        symbol_const_vars.push_back(variables[i].operator std::string());
       }
-      ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol;
-      sym_metadata[symbol] = arrays;
+      ICHECK_EQ(const_vars_by_symbol.count(symbol), 0U) << "Found duplicated symbol: " << symbol;
+      const_vars_by_symbol[symbol] = symbol_const_vars;
     }
     // We only need loading of serialized constant data
     // if there are constants present and required by the
@@ -74,7 +74,7 @@ runtime::Module CreateMetadataModule(
 
     // TODO(@manupa-arm) : we should be able to use csource_metadata
     // if the variables are empty when all the runtime modules implement get_func_names
-    if (arrays.empty() && is_targeting_crt && DSOExportable(mod) &&
+    if (symbol_const_vars.empty() && is_targeting_crt && DSOExportable(mod) &&
         (target->kind->name == "c" || target->kind->name == "llvm")) {
       crt_exportable_modules.push_back(mod);
     } else {
@@ -116,12 +116,13 @@ runtime::Module CreateMetadataModule(
     }
   } else {
     if (!non_crt_exportable_modules.empty()) {
-      runtime::Module binary_meta_mod = runtime::MetadataModuleCreate(params, sym_metadata);
-      binary_meta_mod.Import(target_module);
+      runtime::Module binary_const_loader_mod =
+          runtime::ConstLoaderModuleCreate(const_var_ndarray, const_vars_by_symbol);
+      binary_const_loader_mod.Import(target_module);
       for (const auto& it : non_crt_exportable_modules) {
-        binary_meta_mod.Import(it);
+        binary_const_loader_mod.Import(it);
       }
-      return binary_meta_mod;
+      return binary_const_loader_mod;
     }
   }
   return target_module;

From 66c8ef5b8558e9d6604d5ec20bfde017727264f1 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 29 Nov 2021 20:40:22 -0800
Subject: [PATCH 07/21] Add runtime AOT executor module.

---
 CMakeLists.txt                                |   8 +
 python/tvm/relay/backend/executor_factory.py  |  10 +
 python/tvm/runtime/__init__.py                |   2 +
 python/tvm/runtime/executor/__init__.py       |  26 +++
 python/tvm/runtime/executor/aot_executor.py   | 182 +++++++++++++++++
 src/runtime/aot_executor/aot_executor.cc      | 189 ++++++++++++++++++
 src/runtime/aot_executor/aot_executor.h       | 150 ++++++++++++++
 .../aot_executor/aot_executor_factory.cc      | 132 ++++++++++++
 .../aot_executor/aot_executor_factory.h       | 119 +++++++++++
 9 files changed, 818 insertions(+)
 create mode 100644 python/tvm/runtime/executor/__init__.py
 create mode 100644 python/tvm/runtime/executor/aot_executor.py
 create mode 100644 src/runtime/aot_executor/aot_executor.cc
 create mode 100644 src/runtime/aot_executor/aot_executor.h
 create mode 100644 src/runtime/aot_executor/aot_executor_factory.cc
 create mode 100644 src/runtime/aot_executor/aot_executor_factory.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f87a3a9f617f..9c6a7dddfdf6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O
 tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_EXECUTOR "Build with tiny graph executor" ON)
 tvm_option(USE_GRAPH_EXECUTOR_CUDA_GRAPH "Build with tiny graph executor with CUDA Graph for GPUs" OFF)
+tvm_option(USE_AOT_EXECUTOR "Build with AOT executor" ON)
 tvm_option(USE_PROFILER "Build profiler for the VM and graph executor" ON)
 tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
@@ -395,6 +396,13 @@ if(USE_PROFILER)
   list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
 endif(USE_PROFILER)
 
+if(USE_AOT_EXECUTOR)
+  message(STATUS "Build with AOT Executor support...")
+  file(GLOB RUNTIME_AOT_EXECUTOR_SRCS src/runtime/aot_executor/*.cc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_AOT_EXECUTOR_SRCS})
+
+endif(USE_AOT_EXECUTOR)
+
 # Enable ctest if gtest is available
 if(USE_GTEST)
   # Check env var for backward compatibility. A better way to specify package
diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py
index 9ff7a7a8120b..eee3169400ff 100644
--- a/python/tvm/relay/backend/executor_factory.py
+++ b/python/tvm/relay/backend/executor_factory.py
@@ -109,6 +109,13 @@ def __init__(
         executor_codegen_metadata,
         devices,
     ):
+        fcreate = get_global_func("tvm.aot_executor_factory.create")
+        args = []
+        for k, v in params.items():
+            args.append(k)
+            args.append(ndarray.array(v))
+
+        self.module = fcreate(libmod, libmod_name, *args)
         self.ir_mod = ir_mod
         self.lowered_ir_mods = lowered_ir_mods
         self.target = target
@@ -134,6 +141,9 @@ def get_executor_config(self):
     def get_lib(self):
         return self.lib
 
+    def export_library(self, file_name, fcompile=None, addons=None, **kwargs):
+        return self.module.export_library(file_name, fcompile, addons, **kwargs)
+
 
 class GraphExecutorFactoryModule(ExecutorFactoryModule):
     """Graph executor factory module.
diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py
index b3504dbac506..ab0fc1709fa9 100644
--- a/python/tvm/runtime/__init__.py
+++ b/python/tvm/runtime/__init__.py
@@ -31,3 +31,5 @@
 from .module import load_module, enabled, system_lib
 from .container import String, ShapeTuple
 from .params import save_param_dict, load_param_dict
+
+from . import executor
diff --git a/python/tvm/runtime/executor/__init__.py b/python/tvm/runtime/executor/__init__.py
new file mode 100644
index 000000000000..ecc4097dbaa0
--- /dev/null
+++ b/python/tvm/runtime/executor/__init__.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""This module contains Python wrappers for the TVM C++ Executor implementations.
+
+NOTE: at present, only AOT Executor is contained here. The others are:
+ - GraphExecutor, in python/tvm/contrib/graph_executor.py
+ - VM Executor, in python/tvm/runtime/vm.py
+
+TODO(areusch): Consolidate these into this module.
+"""
+from .aot_executor import AotModule
diff --git a/python/tvm/runtime/executor/aot_executor.py b/python/tvm/runtime/executor/aot_executor.py
new file mode 100644
index 000000000000..91f056ff25fa
--- /dev/null
+++ b/python/tvm/runtime/executor/aot_executor.py
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A Python wrapper for the Module-based Model Runtime Interface for Ahead-of-Time compilation."""
+
+import numpy as np
+
+
+class AotModule(object):
+    """Wraps the AOT executor runtime.Module.
+
+    This is a thin wrapper of the underlying TVM module.
+    you can also directly call set_input, run, and get_output
+    of underlying module functions
+
+    Parameters
+    ----------
+    module : tvm.runtime.Module
+        The internal tvm module that holds the actual graph functions.
+
+    Attributes
+    ----------
+    module : tvm.runtime.Module
+        The internal tvm module that holds the actual graph functions.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import tvm
+        from tvm import relay
+        from tvm.contrib import graph_executor
+
+        # build the library using graph executor
+        lib = relay.build(...)
+        lib.export_library("compiled_lib.so")
+        # load it back as a runtime
+        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
+        # Call the library factory function for default and create
+        # a new runtime.Module, wrap with graph module.
+        gmod = graph_executor.GraphModule(lib["default"](dev))
+        # use the graph module.
+        gmod.set_input("x", data)
+        gmod.run()
+    """
+
+    def __init__(self, module):
+        self.module = module
+        self._set_input = module["set_input"]
+        self._run = module["run"]
+        self._get_output = module["get_output"]
+        self._get_input = module["get_input"]
+        self._get_num_outputs = module["get_num_outputs"]
+        self._get_input_index = module["get_input_index"]
+        self._get_num_inputs = module["get_num_inputs"]
+
+    def set_input(self, key=None, value=None, **params):
+        """Set inputs to the module via kwargs
+
+        Parameters
+        ----------
+        key : int or str
+           The input key
+
+        value : the input value.
+           The input key
+
+        params : dict of str to NDArray
+           Additional arguments
+        """
+        if key is not None:
+            v = self._get_input(key)
+            if v is None:
+                raise RuntimeError("Could not find '%s' in graph's inputs" % key)
+            v.copyfrom(value)
+
+        if params:
+            # upload big arrays first to avoid memory issue in rpc mode
+            keys = list(params.keys())
+            keys.sort(key=lambda x: -np.prod(params[x].shape))
+            for k in keys:
+                # TODO(zhiics) Skip the weights for submodule in a better way.
+                # We should use MetadataModule for initialization and remove
+                # params from set_input
+                val = self._get_input(k)
+                if val:
+                    self._get_input(k).copyfrom(params[k])
+
+    def run(self, **input_dict):
+        """Run forward execution of the graph
+
+        Parameters
+        ----------
+        input_dict: dict of str to NDArray
+            List of input values to be feed to
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+        self._run()
+
+    def get_num_outputs(self):
+        """Get the number of outputs from the graph
+
+        Returns
+        -------
+        count : int
+            The number of outputs.
+        """
+        return self._get_num_outputs()
+
+    def get_num_inputs(self):
+        """Get the number of inputs to the graph
+
+        Returns
+        -------
+        count : int
+            The number of inputs.
+        """
+        return self._get_num_inputs()
+
+    def get_input(self, index, out=None):
+        """Get index-th input to out
+
+        Parameters
+        ----------
+        index : int
+            The input index
+
+        out : NDArray
+            The output array container
+        """
+        if out:
+            self._get_input(index).copyto(out)
+            return out
+
+        return self._get_input(index)
+
+    def get_input_index(self, name):
+        """Get inputs index via input name.
+
+        Parameters
+        ----------
+        name : str
+           The input key name
+
+        Returns
+        -------
+        index: int
+            The input index. -1 will be returned if the given input name is not found.
+        """
+        return self._get_input_index(name)
+
+    def get_output(self, index, out=None):
+        """Get index-th output to out
+
+        Parameters
+        ----------
+        index : int
+            The output index
+
+        out : NDArray
+            The output array container
+        """
+        if out:
+            self._get_output(index, out)
+            return out
+
+        return self._get_output(index)
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
new file mode 100644
index 000000000000..398b0120ab2c
--- /dev/null
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief Defines an implementation of Module-based Model Runtime Interface that works with
+ *        Ahead-of-Time compilation.
+ * \file aot_executor.cc
+ */
+
+#include "aot_executor.h"
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <memory>
+
+#include "../meta_data.h"
+
+namespace tvm {
+namespace runtime {
+
+AotExecutor::AotExecutor(tvm::runtime::Module module, const std::vector<Device>& devs)
+    : module_{module}, devices_{devs} {
+  auto fmetadata = module->GetFunction("get_metadata");
+  CHECK(fmetadata != nullptr) << "Expected a module with PackedFunc get_metadata";
+  auto ret_value = fmetadata();
+  metadata_ = ret_value.AsObjectRef<tvm::runtime::metadata::Metadata>();
+
+  for (auto input : metadata_->inputs()) {
+    // TODO(areusch): Encode device information in Metadata.
+    args_.emplace_back(NDArray::Empty(ShapeTuple(input->shape().begin(), input->shape().end()),
+                                      input->dtype(), devices_[0]));
+  }
+
+  for (auto output : metadata_->outputs()) {
+    args_.emplace_back(NDArray::Empty(ShapeTuple(output->shape().begin(), output->shape().end()),
+                                      output->dtype(), devices_[0]));
+  }
+
+  for (auto pool : metadata_->pools()) {
+    args_.emplace_back(NDArray::Empty(ShapeTuple(pool->shape().begin(), pool->shape().end()),
+                                      pool->dtype(), devices_[0]));
+  }
+}
+
+PackedFunc AotExecutor::GetFunction(const std::string& name,
+                                    const ObjectPtr<Object>& sptr_to_self) {
+  // Return member functions during query.
+  if (name == "set_input") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      if (String::CanConvertFrom(args[0])) {
+        int in_idx = this->GetInputIndex(args[0].operator String());
+        if (in_idx >= 0) this->SetInput(in_idx, args[1]);
+      } else {
+        this->SetInput(args[0], args[1]);
+      }
+    });
+  } else if (name == "set_input_zero_copy") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      if (String::CanConvertFrom(args[0])) {
+        int in_idx = this->GetInputIndex(args[0].operator String());
+        if (in_idx >= 0) this->SetInputZeroCopy(in_idx, args[1]);
+      } else {
+        this->SetInputZeroCopy(args[0], args[1]);
+      }
+    });
+  } else if (name == "set_output_zero_copy") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      if (String::CanConvertFrom(args[0])) {
+        int out_idx = this->GetOutputIndex(args[0].operator String());
+        if (out_idx >= 0) this->SetOutputZeroCopy(out_idx, args[1]);
+      } else {
+        this->SetOutputZeroCopy(args[0], args[1]);
+      }
+    });
+  } else if (name == "get_output") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      if (args.num_args == 2) {
+        this->CopyOutputTo(args[0], args[1]);
+      } else {
+        *rv = this->GetOutput(args[0]);
+      }
+    });
+  } else if (name == "get_input") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      int in_idx = 0;
+      if (String::CanConvertFrom(args[0])) {
+        in_idx = this->GetInputIndex(args[0].operator String());
+      } else {
+        in_idx = args[0];
+      }
+      if (in_idx >= 0) {
+        *rv = this->GetInput(in_idx);
+      }
+    });
+  } else if (name == "get_num_outputs") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumOutputs(); });
+  } else if (name == "get_num_inputs") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); });
+  } else if (name == "run") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); });
+  } else if (name == "get_input_index") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      CHECK(String::CanConvertFrom(args[0])) << "Input key is not a string";
+      *rv = this->GetInputIndex(args[0].operator String());
+    });
+  } else {
+    return PackedFunc();
+  }
+}
+
+void AotExecutor::Run() {
+  auto pf = module_.GetFunction(
+      get_name_mangled(metadata_->mod_name(), ::tvm::runtime::symbol::tvm_module_main),
+      true /* query_imports */);
+  ICHECK(pf != nullptr) << "Module entrypoint is not defined";
+
+  const int num_args = args_.size();
+  ::std::unique_ptr<TVMValue> call_values{new TVMValue[num_args]};
+  ::std::unique_ptr<int> call_type_codes{new int[num_args]};
+  for (int i = 0; i < num_args; ++i) {
+    auto managed = args_[i].ToDLPack();
+    call_values.get()[i].v_handle = &managed->dl_tensor;
+    call_type_codes.get()[i] = kTVMDLTensorHandle;
+  }
+
+  TVMArgs args{call_values.get(), call_type_codes.get(), num_args};
+  TVMRetValue rv;
+  pf.CallPacked(args, &rv);
+}
+
+int AotExecutor::GetInputIndex(const std::string& name) {
+  auto inputs = metadata_->inputs();
+  for (unsigned int i = 0; i < inputs.size(); i++) {
+    if (inputs[i]->name() == name) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+int AotExecutor::GetOutputIndex(const std::string& name) {
+  auto outputs = metadata_->outputs();
+  for (unsigned int i = 0; i < outputs.size(); i++) {
+    if (outputs[i]->name() == name) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+void AotExecutor::SetInput(int index, DLTensor* data_ref) { args_[index].CopyFrom(data_ref); }
+
+void AotExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) {
+  ICHECK(false) << "not implemented";
+}
+
+void AotExecutor::SetOutputZeroCopy(int index, DLTensor* data_ref) {
+  ICHECK(false) << "not implemented";
+}
+
+int AotExecutor::NumOutputs() const { return metadata_->num_outputs(); }
+
+int AotExecutor::NumInputs() const { return metadata_->num_inputs(); }
+
+NDArray AotExecutor::GetInput(int index) const { return args_[index]; }
+
+NDArray AotExecutor::GetOutput(int index) const { return args_[metadata_->num_inputs() + index]; }
+
+void AotExecutor::CopyOutputTo(int index, DLTensor* data_out) { GetOutput(index).CopyTo(data_out); }
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/aot_executor/aot_executor.h b/src/runtime/aot_executor/aot_executor.h
new file mode 100644
index 000000000000..a213ef83bdc2
--- /dev/null
+++ b/src/runtime/aot_executor/aot_executor.h
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief Defines an implementation of Module-based Model Runtime Interface that works with
+ *        Ahead-of-Time compilation.
+ * \file aot_executor.h
+ */
+#ifndef TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_H_
+#define TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_H_
+
+#include <tvm/runtime/metadata.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/object.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+
+class TVM_DLL AotExecutor : public ModuleNode {
+ public:
+  /*!
+   * \brief Implements member function lookup for this Module for the frontend.
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function.
+   */
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) override;
+
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const final { return "AotExecutor"; }
+
+  void Run();
+
+  /*!
+   * \brief Initialize the AOT executor with metadata, runtime::Module, and device.
+   * \param module The module containing the compiled functions for the host
+   *  processor.
+   * \param devs The device of the host and devices where graph nodes will be
+   *  executed on.
+   * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters
+   *  by storage_id. If not given, linked parameters are looked-up using an internal implementation,
+   *  which is not compatible with RPCModules. Default is nullptr.
+   */
+  AotExecutor(tvm::runtime::Module module, const std::vector<Device>& devs);
+
+  /*!
+   * \brief Get the input index given the name of input.
+   * \param name The name of the input.
+   * \return The index of input.
+   */
+  int GetInputIndex(const std::string& name);
+
+  /*!
+   * \brief Get the output index given the name of output.
+   * \param name The name of the output.
+   * \return The index of output.
+   */
+  int GetOutputIndex(const std::string& name);
+
+  /*!
+   * \brief set index-th input to the graph.
+   * \param index The input index.
+   * \param data_in The input data.
+   */
+  void SetInput(int index, DLTensor* data_in);
+  /*!
+   * \brief set index-th input to the graph without copying the data
+   * \param index The input index.
+   * \param data_ref The input data that is referred.
+   */
+  void SetInputZeroCopy(int index, DLTensor* data_ref);
+  /*!
+   * \brief set index-th output to the graph without copying the data.
+   * \param index The output index.
+   * \param data_ref The output data that is referred.
+   */
+  void SetOutputZeroCopy(int index, DLTensor* data_ref);
+  /*!
+   * \brief Get the number of outputs
+   *
+   * \return The number of outputs from graph.
+   */
+  int NumOutputs() const;
+  /*!
+   * \brief Get the number of inputs
+   *
+   * \return The number of inputs to the graph.
+   */
+  int NumInputs() const;
+  /*!
+   * \brief Return NDArray for given input index.
+   * \param index The input index.
+   *
+   * \return NDArray corresponding to given input node index.
+   */
+  NDArray GetInput(int index) const;
+  /*!
+   * \brief Return NDArray for given output index.
+   * \param index The output index.
+   *
+   * \return NDArray corresponding to given output node index.
+   */
+  NDArray GetOutput(int index) const;
+  /*!
+   * \brief Copy index-th output to data_out.
+   * \param index The output index.
+   * \param data_out the output data.
+   */
+  void CopyOutputTo(int index, DLTensor* data_out);
+
+ private:
+  /*! \brief Metadata provided to the runtime from the compiler. */
+  metadata::Metadata metadata_;
+
+  /*! \brief Runtime module which contains the AOT top-level function. */
+  Module module_;
+
+  /*! \brief The devices which should be used to execute the computations. */
+  std::vector<Device> devices_;
+
+  /*! \brief Holds one NDArray per function argument in the same order. */
+  std::vector<NDArray> args_;
+};
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_H_
diff --git a/src/runtime/aot_executor/aot_executor_factory.cc b/src/runtime/aot_executor/aot_executor_factory.cc
new file mode 100644
index 000000000000..4cb3026991fe
--- /dev/null
+++ b/src/runtime/aot_executor/aot_executor_factory.cc
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file aot_executor_factory.cc
+ * \brief Graph executor factory implementations
+ */
+
+#include "./aot_executor_factory.h"
+
+#include <tvm/runtime/container/string.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+
+#include <iterator>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+
+AotExecutorFactory::AotExecutorFactory(
+    const std::unordered_map<std::string, tvm::runtime::NDArray>& params,
+    const std::string& module_name) {
+  params_ = params;
+  module_name_ = module_name;
+}
+
+PackedFunc AotExecutorFactory::GetFunction(
+    const std::string& name, const tvm::runtime::ObjectPtr<tvm::runtime::Object>& sptr_to_self) {
+  if (name == module_name_) {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      ICHECK_GT(args.num_args, 0) << "Must supply at least one device argument";
+      std::vector<Device> devices;
+      for (int i = 0; i < args.num_args; ++i) {
+        devices.emplace_back(args[i].operator Device());
+      }
+      *rv = this->ExecutorCreate(devices);
+    });
+  } else if (name == "remove_params") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::unordered_map<std::string, tvm::runtime::NDArray> empty_params{};
+      auto exec = make_object<AotExecutorFactory>(empty_params, this->module_name_);
+      exec->Import(this->imports_[0]);
+      *rv = Module(exec);
+    });
+  } else {
+    return PackedFunc();
+  }
+}
+
+void AotExecutorFactory::SaveToBinary(dmlc::Stream* stream) {
+  std::vector<std::string> names;
+  std::vector<DLTensor*> arrays;
+  for (const auto& v : params_) {
+    names.emplace_back(v.first);
+    arrays.emplace_back(const_cast<DLTensor*>(v.second.operator->()));
+  }
+  uint64_t sz = arrays.size();
+  ICHECK(sz == names.size());
+  stream->Write(sz);
+  stream->Write(names);
+  for (size_t i = 0; i < sz; ++i) {
+    tvm::runtime::SaveDLTensor(stream, arrays[i]);
+  }
+  stream->Write(module_name_);
+}
+
+Module AotExecutorFactory::ExecutorCreate(const std::vector<Device>& devs) {
+  auto exec = make_object<AotExecutor>(this->imports_[0], devs);
+  // set params
+  SetParams(exec.get(), this->params_);
+  return Module(exec);
+}
+
+Module AotExecutorFactoryModuleLoadBinary(void* strm) {
+  dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
+  std::unordered_map<std::string, tvm::runtime::NDArray> params;
+  std::string module_name;
+  uint64_t sz;
+  ICHECK(stream->Read(&sz));
+  std::vector<std::string> names;
+  ICHECK(stream->Read(&names));
+  ICHECK(sz == names.size());
+  for (size_t i = 0; i < sz; ++i) {
+    tvm::runtime::NDArray temp;
+    temp.Load(stream);
+    params[names[i]] = temp;
+  }
+  ICHECK(stream->Read(&module_name));
+  auto exec = make_object<AotExecutorFactory>(params, module_name);
+  return Module(exec);
+}
+
+TVM_REGISTER_GLOBAL("tvm.aot_executor_factory.create").set_body([](TVMArgs args, TVMRetValue* rv) {
+  ICHECK_GE(args.num_args, 2) << "The expected number of arguments for "
+                                 "aot_executor_factory.create needs at least 2, "
+                                 "but it has "
+                              << args.num_args;
+  // The argument order is module, module_name, param0_name, param0_tensor,
+  // [param1_name, param1_tensor], ...
+  ICHECK_EQ((args.size() - 2) % 2, 0);
+  std::unordered_map<std::string, tvm::runtime::NDArray> params;
+  for (size_t i = 2; i < static_cast<size_t>(args.size()); i += 2) {
+    std::string name = args[i].operator String();
+    params[name] = args[i + 1].operator tvm::runtime::NDArray();
+  }
+  auto exec = make_object<AotExecutorFactory>(params, args[1]);
+  exec->Import(args[0]);
+  *rv = Module(exec);
+});
+
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_AotExecutorFactory")
+    .set_body_typed(AotExecutorFactoryModuleLoadBinary);
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/aot_executor/aot_executor_factory.h b/src/runtime/aot_executor/aot_executor_factory.h
new file mode 100644
index 000000000000..1d6a0a62776e
--- /dev/null
+++ b/src/runtime/aot_executor/aot_executor_factory.h
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/aot_executor/aot_executor_factory.h
+ * \brief Aot executor factory creating aot executor.
+ */
+
+#ifndef TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_FACTORY_H_
+#define TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_FACTORY_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "./aot_executor.h"
+
+namespace tvm {
+namespace runtime {
+
+class TVM_DLL AotExecutorFactory : public runtime::ModuleNode {
+ public:
+  /*!
+   * \brief Construct the AotExecutorFactory.
+   * \param params The params of aot.
+   * \param module_name The module name of aot.
+   */
+  AotExecutorFactory(const std::unordered_map<std::string, tvm::runtime::NDArray>& params,
+                     const std::string& module_name);
+
+  /*!
+   * \brief Get member function to front-end
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function.
+   */
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final;
+
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const override { return "AotExecutorFactory"; }
+
+  /*!
+   * \brief Save the module to binary stream.
+   * \param stream The binary stream to save to.
+   */
+  void SaveToBinary(dmlc::Stream* stream) override;
+
+  /*!
+   * \brief Create a specific executor module
+   * \param devs The device of the host and devices where the model will be
+   *  executed.
+   * \return created executor module
+   */
+  Module ExecutorCreate(const std::vector<Device>& devs);
+
+  /*!
+   * \brief Set params.
+   * \param aot_executor The aot executor we want to set the params into.
+   * \param params The aot params value we want to set.
+   */
+  void SetParams(AotExecutor* aot_executor,
+                 const std::unordered_map<std::string, tvm::runtime::NDArray>& params) const {
+    std::unordered_map<std::string, tvm::runtime::NDArray> value = params;
+    // upload big arrays first to avoid memory issue in rpc mode
+    std::vector<std::string> keys;
+    for (const auto& p : value) {
+      keys.emplace_back(p.first);
+    }
+    std::sort(std::begin(keys), std::end(keys),
+              [&](const std::string& lhs, const std::string& rhs) -> bool {
+                auto lhs_size = GetDataSize(*value[lhs].operator->());
+                auto rhs_size = GetDataSize(*value[rhs].operator->());
+                return lhs_size > rhs_size;
+              });
+    for (const auto& key : keys) {
+      int in_idx = aot_executor->GetInputIndex(key);
+      if (in_idx >= 0) {
+        aot_executor->SetInput(in_idx, const_cast<DLTensor*>(value[key].operator->()));
+      }
+    }
+  }
+
+ protected:
+  /*! \brief The params. */
+  std::unordered_map<std::string, tvm::runtime::NDArray> params_;
+  /*! \brief module name */
+  std::string module_name_;
+};
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_AOT_EXECUTOR_AOT_EXECUTOR_FACTORY_H_

From b49bea64a29fe92e728870341ced28eb29a7d6c1 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 29 Nov 2021 20:44:07 -0800
Subject: [PATCH 08/21] Add AOT code-generation.

---
 src/relay/backend/aot_executor_codegen.cc     |  13 +-
 src/runtime/aot_executor/aot_executor.cc      |   1 +
 src/target/metadata_module.cc                 | 176 ++++++++---
 src/target/source/codegen_c_host.cc           |  18 ++
 src/target/source/codegen_c_host.h            |   1 +
 src/target/source/codegen_source_base.h       |  24 ++
 src/target/source/source_module.cc            | 288 ++++++++++++++++++
 src/target/source/source_module.h             |  10 +-
 src/tir/transforms/legalize_packed_calls.cc   |   6 +
 tests/python/relay/aot/test_c_device_api.py   |  29 +-
 tests/python/relay/aot/test_crt_aot.py        |  10 +-
 .../unittest/test_aot_legalize_packed_call.py |   9 +
 12 files changed, 527 insertions(+), 58 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 65f293b15cdb..3fa4fa5e8066 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -352,7 +352,12 @@ class AOTExecutorCodegen : public MixedModeVisitor {
       if (params_by_expr_.find(arg) != params_by_expr_.end()) {
         auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(),
                                            {tir::StringImm(params_by_expr_[arg])});
-        args.push_back(tvm::tir::Cast(DataType::Handle(), param_handle));
+        // NOTE: this cast looks like a no-op, but is required for compilation downstream.
+        // Because DataType::Handle has default bits=64, but CodeGenC does not observe this field,
+        // adding this cast forces the codegen to insert the cast. In this case, a cast is required
+        // because param_handle is actually code-generated as `const void*`, and the `const` piece
+        // needs to be removed.
+        args.push_back(tvm::tir::Cast(DataType::Handle(32, 1), param_handle));
       } else {
         auto sids = FindExpr(arg);
         PushArgs(arg, sids, &args);
@@ -907,9 +912,9 @@ class AOTExecutorCodegen : public MixedModeVisitor {
 
     // Validate choice of use_unpacked_api_ and use_call_cpacked_
     if (runtime_config->name == kTvmRuntimeCrt) {
-      CHECK(interface_api == "c" || static_cast<bool>(use_unpacked_api_) == false)
-          << "Either need interface_api == \"c\" (got: " << interface_api
-          << ") or unpacked-api == false (got: " << use_unpacked_api_
+      CHECK(interface_api == "packed" || static_cast<bool>(use_unpacked_api_) == true)
+          << "Either need interface_api == \"packed\" (got: " << interface_api
+          << ") or unpacked-api == true (got: " << use_unpacked_api_
           << ") when targeting c runtime";
     } else if (runtime_config->name == kTvmRuntimeCpp) {
       CHECK(static_cast<bool>(use_unpacked_api_) == false &&
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 398b0120ab2c..403a8f09e2ae 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -26,6 +26,7 @@
 #include "aot_executor.h"
 
 #include <tvm/runtime/c_runtime_api.h>
+
 #include <memory>
 
 #include "../meta_data.h"
diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
index 14ac3c9b662f..1fec52c46609 100644
--- a/src/target/metadata_module.cc
+++ b/src/target/metadata_module.cc
@@ -25,8 +25,10 @@
 
 #include <tvm/relay/runtime.h>
 
+#include <utility>
 #include <vector>
 
+#include "../runtime/const_loader_module.h"
 #include "../runtime/meta_data.h"
 #include "llvm/llvm_module.h"
 #include "source/source_module.h"
@@ -34,6 +36,132 @@
 namespace tvm {
 namespace codegen {
 
+static runtime::Module CreateCrtMetadataModule(
+    runtime::Module target_module, Target target, relay::Runtime runtime,
+    relay::backend::ExecutorCodegenMetadata metadata,
+    Array<runtime::Module> non_crt_exportable_modules,
+    Array<runtime::Module> crt_exportable_modules,
+    const std::unordered_map<std::string, runtime::NDArray>& const_var_ndarray) {
+  if (!non_crt_exportable_modules.empty()) {
+    std::string non_exportable_modules;
+    for (unsigned int i = 0; i < non_crt_exportable_modules.size(); i++) {
+      if (i > 0) {
+        non_exportable_modules += ", ";
+      }
+      auto mod = non_crt_exportable_modules[i];
+      auto pf_sym = mod.GetFunction("get_symbol");
+      if (pf_sym != nullptr) {
+        non_exportable_modules += pf_sym().operator std::string();
+      } else {
+        non_exportable_modules +=
+            std::string{"(module type_key="} + mod->type_key() + std::string{")"};
+      }
+    }
+    CHECK(false) << "These " << non_crt_exportable_modules.size()
+                 << " modules are not exportable to C-runtime: " << non_exportable_modules;
+  }
+
+  if (target->kind->name == "c") {
+    crt_exportable_modules.push_back(target_module);
+    target_module =
+        CreateCSourceCrtMetadataModule(crt_exportable_modules, target, runtime, metadata);
+  } else if (target->kind->name == "llvm") {
+#ifdef TVM_LLVM_VERSION
+    crt_exportable_modules.push_back(target_module);
+    target_module = CreateLLVMCrtMetadataModule(crt_exportable_modules, target, runtime);
+#else   // TVM_LLVM_VERSION
+    LOG(FATAL) << "TVM was not built with LLVM enabled.";
+#endif  // TVM_LLVM_VERSION
+  }
+
+  return target_module;
+}
+
+// TODO(areusch,masahi): Unify metadata representation and remove the need for this function
+static runtime::metadata::Metadata ConvertMetaData(
+    relay::backend::ExecutorCodegenMetadata metadata) {
+  std::vector<runtime::metadata::TensorInfo> inputs;
+  ICHECK(metadata.defined());
+  for (size_t i = 0; i < metadata->inputs.size(); ++i) {
+    auto v = metadata->inputs[i];
+    auto ttype = metadata->input_tensor_types[i];
+    inputs.push_back(
+        runtime::metadata::TensorInfo(make_object<target::metadata::InMemoryTensorInfoNode>(
+            v->name_hint, relay::backend::ShapeToJSON(ttype->shape), ttype->dtype)));
+  }
+
+  std::vector<runtime::metadata::TensorInfo> outputs;
+  auto output_ttypes = metadata->output_tensor_types;
+  for (size_t i = 0; i < output_ttypes.size(); ++i) {
+    auto ttype = output_ttypes[i];
+    std::stringstream name;
+    name << "output" << i;
+    outputs.push_back(
+        runtime::metadata::TensorInfo(make_object<target::metadata::InMemoryTensorInfoNode>(
+            name.str(), relay::backend::ShapeToJSON(ttype->shape), ttype->dtype)));
+  }
+
+  std::vector<runtime::metadata::TensorInfo> pools;
+  for (size_t i = 0; i < metadata->pools.size(); ++i) {
+    auto var = metadata->pools[i];
+    pools.push_back(
+        runtime::metadata::TensorInfo(make_object<target::metadata::InMemoryTensorInfoNode>(
+            var->name_hint,
+            std::vector<int64_t>{metadata->pool_inputs.value()[var]->allocated_size},
+            tvm::runtime::DataType{kDLUInt, 8, 1})));
+  }
+
+  auto n = make_object<target::metadata::InMemoryMetadataNode>(
+      runtime::metadata::kMetadataVersion, inputs, outputs, pools, metadata->mod_name);
+
+  return runtime::metadata::Metadata(std::move(n));
+}
+
+static runtime::Module CreateCppMetadataModule(
+    runtime::Module target_module, Target target, relay::Runtime runtime,
+    relay::backend::ExecutorCodegenMetadata metadata,
+    const std::unordered_map<std::string, std::vector<std::string>>& const_vars_by_symbol,
+    Array<runtime::Module> non_crt_exportable_modules,
+    Array<runtime::Module> crt_exportable_modules,
+    const std::unordered_map<std::string, runtime::NDArray>& const_var_ndarray) {
+  if (!non_crt_exportable_modules.empty()) {
+    runtime::Module const_loader_mod =
+        runtime::ConstLoaderModuleCreate(const_var_ndarray, const_vars_by_symbol);
+    const_loader_mod.Import(target_module);
+    for (const auto& it : non_crt_exportable_modules) {
+      const_loader_mod.Import(it);
+    }
+    target_module = const_loader_mod;
+  }
+
+  if (metadata.defined()) {
+    runtime::metadata::Metadata runtime_metadata = ConvertMetaData(metadata);
+
+    if (metadata->executor == runtime::kTvmExecutorAot && runtime->name == relay::kTvmRuntimeCpp) {
+      if (target->kind->name == "c") {
+        auto metadata_module = CreateCSourceCppMetadataModule(runtime_metadata);
+        metadata_module->Import(target_module);
+        target_module = metadata_module;
+      } else {
+        CHECK(false) << "Don't know how to create MetadataModule for target type " << target->str();
+      }
+    }
+  }
+
+  return target_module;
+}
+
+/*!
+ * \brief Create a metadata module wrapper. The helper is used by different
+ *        codegens, such as graph executor codegen and the vm compiler.
+ *
+ * \param params The metadata for initialization of all modules.
+ * \param target_module the internal module that is compiled by tvm.
+ * \param ext_modules The external modules that needs to be imported inside the metadata
+ * module(s).
+ * \param target The target that all the modules are compiled for
+ * \return The created metadata module that manages initialization of metadata.
+ */
 runtime::Module CreateMetadataModule(
     const std::unordered_map<std::string, runtime::NDArray>& const_var_ndarray,
     tvm::runtime::Module target_module, const Array<runtime::Module>& ext_modules, Target target,
@@ -83,50 +211,16 @@ runtime::Module CreateMetadataModule(
   }
 
   if (is_targeting_crt) {
-    if (!non_crt_exportable_modules.empty()) {
-      std::string non_exportable_modules;
-      for (unsigned int i = 0; i < non_crt_exportable_modules.size(); i++) {
-        if (i > 0) {
-          non_exportable_modules += ", ";
-        }
-        auto mod = non_crt_exportable_modules[i];
-        auto pf_sym = mod.GetFunction("get_symbol");
-        if (pf_sym != nullptr) {
-          non_exportable_modules += pf_sym().operator std::string();
-        } else {
-          non_exportable_modules +=
-              std::string{"(module type_key="} + mod->type_key() + std::string{")"};
-        }
-      }
-      CHECK(false) << "These " << non_crt_exportable_modules.size()
-                   << " modules are not exportable to C-runtime: " << non_exportable_modules;
-    }
-
-    if (target->kind->name == "c") {
-      crt_exportable_modules.push_back(target_module);
-      target_module =
-          CreateCSourceCrtMetadataModule(crt_exportable_modules, target, runtime, metadata);
-    } else if (target->kind->name == "llvm") {
-#ifdef TVM_LLVM_VERSION
-      crt_exportable_modules.push_back(target_module);
-      target_module = CreateLLVMCrtMetadataModule(crt_exportable_modules, target, runtime);
-#else   // TVM_LLVM_VERSION
-      LOG(FATAL) << "TVM was not built with LLVM enabled.";
-#endif  // TVM_LLVM_VERSION
-    }
+    return CreateCrtMetadataModule(target_module, target, runtime, metadata,
+                                   non_crt_exportable_modules, crt_exportable_modules,
+                                   const_var_ndarray);
   } else {
-    if (!non_crt_exportable_modules.empty()) {
-      runtime::Module binary_const_loader_mod =
-          runtime::ConstLoaderModuleCreate(const_var_ndarray, const_vars_by_symbol);
-      binary_const_loader_mod.Import(target_module);
-      for (const auto& it : non_crt_exportable_modules) {
-        binary_const_loader_mod.Import(it);
-      }
-      return binary_const_loader_mod;
-    }
+    return CreateCppMetadataModule(target_module, target, runtime, metadata, const_vars_by_symbol,
+                                   non_crt_exportable_modules, crt_exportable_modules,
+                                   const_var_ndarray);
   }
-  return target_module;
 }
 
 }  // namespace codegen
+
 }  // namespace tvm
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index d7fb3dcf6d80..7ddea46c07bd 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -23,6 +23,7 @@
 #include "codegen_c_host.h"
 
 #include <tvm/relay/executor.h>
+#include <tvm/relay/runtime.h>
 #include <tvm/runtime/crt/error_codes.h>
 #include <tvm/runtime/module.h>
 #include <tvm/target/codegen.h>
@@ -51,6 +52,10 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s
   CodeGenC::Init(output_ssa);
 }
 
+void CodeGenCHost::InitGlobalContext() {
+  decl_stream << "void* " << tvm::runtime::symbol::tvm_module_ctx << " = NULL;\n";
+}
+
 void CodeGenCHost::DefineModuleName() { decl_stream << "void* " << module_name_ << " = NULL;\n"; }
 
 void CodeGenCHost::AddFunction(const PrimFunc& f) {
@@ -366,6 +371,19 @@ runtime::Module BuildCHost(IRModule mod, Target target) {
     cg.AddFunction(aot_executor_fn);
   }
 
+  // NOTE: it's possible that kRuntime attr is not attached when the mod was built with tvm.build().
+  // See issue #10373.
+  auto opt_runtime = mod->GetAttr<relay::Runtime>(tvm::attr::kRuntime);
+  relay::Runtime runtime;
+  if (opt_runtime.get() != nullptr) {
+    runtime = opt_runtime.value();
+  } else {
+    runtime = relay::Runtime::Create("cpp", {});
+  }
+  if (aot_executor_fn.defined() && runtime->name == relay::kTvmRuntimeCpp) {
+    cg.InitGlobalContext();
+  }
+
   if (target->GetAttr<Bool>("system-lib").value_or(Bool(false))) {
     ICHECK_EQ(target->GetAttr<String>("runtime").value_or(""), "c")
         << "c target only supports generating C runtime SystemLibs";
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index 44e791ef7bc3..c0e4ee9a263c 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -40,6 +40,7 @@ class CodeGenCHost : public CodeGenC {
   CodeGenCHost();
   void Init(bool output_ssa, bool emit_asserts, std::string target_str);
 
+  void InitGlobalContext();
   void AddFunction(const PrimFunc& f);
 
   void DefineModuleName();
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index 8f8f9e1b8bf2..133886758430 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -25,6 +25,7 @@
 #ifndef TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_
 #define TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_
 
+#include <tvm/runtime/metadata.h>
 #include <tvm/target/codegen.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/op.h>
@@ -157,6 +158,19 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt,
                                     const Array<String>& func_names,
                                     const Array<String>& const_vars = {});
 
+/*!
+ * \brief Wrap the submodules in a metadata module.
+ * \param params The variable to constant mapping that is collected by the host
+ *        module.
+ * \param target_module The main TIR-lowered internal runtime module
+ * \param modules All the external modules that needs to be imported inside the metadata module(s).
+ * \param target The target that all the modules are compiled for
+ * \return The wrapped module.
+ */
+runtime::Module CreateMetadataModule(
+    const std::unordered_map<std::string, runtime::NDArray>& params, runtime::Module target_module,
+    const Array<runtime::Module>& ext_modules, Target target, runtime::metadata::Metadata metadata);
+
 /*!
  * \brief Create a source module for viewing and limited saving for device.
  * \param data The code data to be viewed.
@@ -169,6 +183,16 @@ runtime::Module DeviceSourceModuleCreate(
     std::string data, std::string fmt, std::unordered_map<std::string, runtime::FunctionInfo> fmap,
     std::string type_key, std::function<std::string(const std::string&)> fget_source = nullptr);
 
+/*!
+ * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime.
+ * \param modules The modules to be wrapped.
+ * \param target the target the modules are compiled for.
+ * \param metadata the metadata needed for code generation.
+ * \return The wrapped module.
+ */
+runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules, Target target,
+                                               runtime::metadata::Metadata metadata);
+
 }  // namespace codegen
 }  // namespace tvm
 #endif  // TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 97461ca2091f..e6acc1387806 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -28,13 +28,17 @@
 #include <tvm/runtime/registry.h>
 
 #include <string>
+#include <tuple>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
+#include <vector>
 
 #include "../../relay/backend/name_transforms.h"
 #include "../../runtime/file_utils.h"
 #include "../../support/str_escape.h"
 #include "../func_registry_generator.h"
+#include "../metadata.h"
 #include "codegen_source_base.h"
 
 namespace tvm {
@@ -518,6 +522,264 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
   }
 };
 
+static std::string address_from_parts(const std::vector<std::string>& parts) {
+  std::stringstream ss;
+  for (unsigned int i = 0; i < parts.size(); ++i) {
+    if (i > 0) {
+      ss << "_";
+    }
+    ss << parts[i];
+  }
+  return ss.str();
+}
+
+class MetadataQueuer : public AttrVisitor {
+ public:
+  using QueueItem = std::tuple<std::string, runtime::metadata::MetadataBase>;
+  explicit MetadataQueuer(std::vector<QueueItem>* queue) : queue_{queue} {}
+
+  void Visit(const char* key, double* value) final {}
+  void Visit(const char* key, int64_t* value) final {}
+  void Visit(const char* key, uint64_t* value) final {}
+  void Visit(const char* key, int* value) final {}
+  void Visit(const char* key, bool* value) final {}
+  void Visit(const char* key, std::string* value) final {}
+  void Visit(const char* key, DataType* value) final {}
+  void Visit(const char* key, runtime::NDArray* value) final {}
+  void Visit(const char* key, void** value) final {}
+
+  void Visit(const char* key, ObjectRef* value) final {
+    address_parts_.push_back(key);
+    if (value->as<runtime::metadata::MetadataBaseNode>() != nullptr) {
+      auto metadata = Downcast<runtime::metadata::MetadataBase>(*value);
+      const runtime::metadata::MetadataArrayNode* arr =
+          value->as<runtime::metadata::MetadataArrayNode>();
+      std::cout << "Is array? " << arr << std::endl;
+      if (arr != nullptr) {
+        for (unsigned int i = 0; i < arr->array.size(); i++) {
+          ObjectRef o = arr->array[i];
+          std::cout << "queue-visiting array element " << i << ": " << o->type_index() << " ("
+                    << o.operator->() << ")" << std::endl;
+          if (o.as<runtime::metadata::MetadataBaseNode>() != nullptr) {
+            std::stringstream ss;
+            ss << i;
+            address_parts_.push_back(ss.str());
+            runtime::metadata::MetadataBase metadata = Downcast<runtime::metadata::MetadataBase>(o);
+            ReflectionVTable::Global()->VisitAttrs(metadata.operator->(), this);
+            address_parts_.pop_back();
+          }
+        }
+      } else {
+        ReflectionVTable::Global()->VisitAttrs(metadata.operator->(), this);
+      }
+
+      queue_->push_back(std::make_tuple(address_from_parts(address_parts_),
+                                        Downcast<runtime::metadata::MetadataBase>(*value)));
+    }
+    address_parts_.pop_back();
+  }
+
+ private:
+  std::vector<QueueItem>* queue_;
+  std::vector<std::string> address_parts_;
+};
+
+class MetadataSerializer : public AttrVisitor {
+ public:
+  static constexpr const char* kGlobalSymbol = "kTvmgenMetadata";
+  using MetadataTypeIndex = ::tvm::runtime::metadata::MetadataTypeIndex;
+
+  MetadataSerializer() : is_first_item_{true} {}
+
+  void WriteComma() {
+    if (is_first_item_) {
+      is_first_item_ = false;
+    } else {
+      code_ << ", " << std::endl;
+    }
+  }
+
+  void WriteKey(const char* key) {
+    if (key != nullptr) {
+      code_ << " /* " << key << "*/";
+    }
+  }
+
+  void Visit(const char* key, double* value) final {
+    WriteComma();
+    code_.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific,
+               std::ios::basefield | std::ios::showbase | std::ios::floatfield);
+    code_ << *value;
+    WriteKey(key);
+  }
+
+  void Visit(const char* key, int64_t* value) final {
+    WriteComma();
+    code_ << *value << "L";
+    WriteKey(key);
+  }
+
+  void Visit(const char* key, uint64_t* value) final {
+    WriteComma();
+    code_ << *value << "UL";
+    WriteKey(key);
+  }
+  void Visit(const char* key, int* value) final {
+    WriteComma();
+    code_ << *value;
+    WriteKey(key);
+  }
+  void Visit(const char* key, bool* value) final {
+    WriteComma();
+    code_ << *value;
+    WriteKey(key);
+  }
+  void Visit(const char* key, std::string* value) final {
+    WriteComma();
+    code_ << "\"" << *value << "\"";
+    WriteKey(key);
+  }
+  void Visit(const char* key, void** value) final {
+    WriteComma();
+    code_ << *value;
+    WriteKey(key);
+  }
+  void Visit(const char* key, DataType* value) final {
+    WriteComma();
+    code_ << "{" << value->code() << ", " << value->bits() << ", " << value->lanes() << "}";
+    WriteKey(key);
+  }
+
+  void Visit(const char* key, runtime::NDArray* value) final {
+    // TODO(areusch): probably we could consolidate --link-params here, tho...
+    ICHECK(false) << "do not support serializing NDArray as metadata";
+  }
+
+  void VisitArray(const runtime::metadata::MetadataArrayNode* array) {
+    std::cout << "visit array " << array << ": " << array->type_index << " " << array->struct_name
+              << "," << array->array.size() << std::endl;
+    auto old_is_first_item = is_first_item_;
+    is_first_item_ = true;
+    for (unsigned int i = 0; i < array->array.size(); ++i) {  // ObjectRef o : *(array->array)) {
+      ObjectRef o = array->array[i];
+      std::cout << "visiting array element " << i << ": " << o->type_index() << " ("
+                << o.operator->() << ")" << std::endl;
+      if (o->IsInstance<IntImmNode>()) {
+        int64_t i = Downcast<Integer>(o);
+        Visit(nullptr, &i);
+        continue;
+      }
+
+      if (o->IsInstance<StringObj>()) {
+        std::string s = Downcast<String>(o);
+        Visit(nullptr, &s);
+        continue;
+      }
+
+      runtime::metadata::MetadataBase metadata = Downcast<runtime::metadata::MetadataBase>(o);
+      std::stringstream i_str;
+      i_str << i;
+      address_.push_back(i_str.str());
+      Visit(nullptr, &metadata);
+      address_.pop_back();
+      //      ReflectionVTable::Global()->VisitAttrs(metadata.operator->(), this);
+    }
+    is_first_item_ = old_is_first_item;
+  }
+
+  void Visit(const char* key, ObjectRef* value) final {
+    const runtime::metadata::MetadataArrayNode* arr =
+        value->as<runtime::metadata::MetadataArrayNode>();
+    std::cout << "Is array? " << arr << std::endl;
+    if (arr != nullptr) {
+      WriteComma();
+      if (key != nullptr) {
+        address_.push_back(key);
+      }
+      code_ << address_from_parts(address_);
+      if (key != nullptr) {
+        address_.pop_back();
+      }
+      return;
+    }
+
+    runtime::metadata::MetadataBase metadata = Downcast<runtime::metadata::MetadataBase>(*value);
+    if (key != nullptr) {  // NOTE: outermost call passes nullptr key
+      address_.push_back(key);
+    }
+    ReflectionVTable::Global()->VisitAttrs(metadata.operator->(), this);
+    if (key != nullptr) {  // NOTE: outermost call passes nullptr key
+      address_.pop_back();
+    }
+  }
+
+  void CodegenMetadata(::tvm::runtime::metadata::Metadata metadata) {
+    decl_ << "#include <inttypes.h>" << std::endl
+          << "#include <tvm/runtime/metadata.h>" << std::endl
+          << "#include <tvm/runtime/c_runtime_api.h>" << std::endl;
+    std::vector<MetadataQueuer::QueueItem> queue;
+    MetadataQueuer queuer{&queue};
+    queuer.Visit(kGlobalSymbol, &metadata);
+
+    for (MetadataQueuer::QueueItem item : queue) {
+      auto struct_name = std::get<0>(item);
+      auto obj = std::get<1>(item);
+      auto arr = obj.as<runtime::metadata::MetadataArrayNode>();
+      std::cout << "codegen: " << struct_name;
+      is_first_item_ = true;
+      address_.push_back(struct_name);
+      if (arr != nullptr) {
+        const char* const_part = "const ";
+        if (arr->type_index == MetadataTypeIndex::kString) {
+          const_part = "";
+        }
+        code_ << const_part;
+        switch (arr->type_index) {
+          case MetadataTypeIndex::kUint64:
+            code_ << "uint64_t";
+            break;
+          case MetadataTypeIndex::kInt64:
+            code_ << "int64_t";
+            break;
+          case MetadataTypeIndex::kBool:
+            code_ << "bool";
+            break;
+          case MetadataTypeIndex::kString:
+            code_ << "const char*";
+            break;
+          case MetadataTypeIndex::kHandle:
+            code_ << "void*";
+            break;
+          case MetadataTypeIndex::kMetadata:
+            code_ << "struct " << arr->struct_name;
+            break;
+          default:
+            CHECK(false) << "Unknown type_index in array: " << arr->type_index
+                         << " (struct_name=" << arr->struct_name << ")";
+            break;
+        }
+        code_ << " " << struct_name << "[" << arr->array.size() << "] = {" << std::endl;
+        VisitArray(arr);
+      } else {
+        code_ << "const struct TVMMetadata " << struct_name << " = {" << std::endl;
+        Visit(nullptr, &obj);
+      }
+      address_.pop_back();
+      code_ << "};" << std::endl;
+    }
+  }
+
+  std::string GetOutput() { return decl_.str() + code_.str(); }
+
+ private:
+  std::vector<std::string> address_;
+  std::stringstream decl_;
+  std::stringstream code_;
+  bool is_first_item_;
+  std::unordered_set<std::string> generated_struct_decls_;
+  std::vector<bool> is_defining_struct_;
+};
+
 runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules, Target target,
                                                relay::Runtime runtime,
                                                relay::backend::ExecutorCodegenMetadata metadata) {
@@ -539,6 +801,32 @@ runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& mod
   return std::move(csrc_metadata_module);
 }
 
+runtime::Module CreateCSourceCppMetadataModule(runtime::metadata::Metadata metadata) {
+  MetadataSerializer serializer;
+  serializer.CodegenMetadata(metadata);
+  std::stringstream lookup_func;
+  lookup_func << "#ifdef __cplusplus\n"
+              << "extern \"C\"\n"
+              << "#endif\n";
+
+  lookup_func << "TVM_DLL int32_t get_c_metadata(TVMValue* arg_values, int* arg_tcodes, int "
+                 "num_args, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) {"
+              << std::endl;
+  lookup_func << "    ret_values[0].v_handle = (void*) &" << MetadataSerializer::kGlobalSymbol
+              << ";" << std::endl;
+  lookup_func << "    ret_tcodes[0] = kTVMOpaqueHandle;" << std::endl;
+  lookup_func << "    return 0;" << std::endl;
+  lookup_func << "};" << std::endl;
+
+  auto mod = MetadataModuleCreate(metadata);
+  std::vector<String> func_names{"get_c_metadata"};
+  // definer.GetOutput() +
+  auto c = CSourceModuleCreate(serializer.GetOutput() + lookup_func.str(), "c", func_names,
+                               Array<String>());
+  mod->Import(c);
+  return mod;
+}
+
 // supports limited save without cross compile
 class DeviceSourceModuleNode final : public runtime::ModuleNode {
  public:
diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h
index 3b482a107600..2a63a8eeb814 100644
--- a/src/target/source/source_module.h
+++ b/src/target/source/source_module.h
@@ -26,6 +26,7 @@
 #define TVM_TARGET_SOURCE_SOURCE_MODULE_H_
 
 #include <tvm/relay/runtime.h>
+#include <tvm/runtime/metadata.h>
 #include <tvm/runtime/module.h>
 #include <tvm/target/target.h>
 
@@ -36,17 +37,24 @@ namespace tvm {
 namespace codegen {
 
 /*!
+
  * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime.
  * \param modules The modules to be wrapped.
  * \param target the target the modules are compiled for.
  * \param runtime the runtime to code generate against
- * \param metadata the metadata needed for code generation.
+ * \param metadata Compiler-generated metadata exported to runtime.
  * \return The wrapped module.
  */
 runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules, Target target,
                                                relay::Runtime runtime,
                                                relay::backend::ExecutorCodegenMetadata metadata);
 
+/*!
+ * \brief Create C++-runtime targeted metadata module for "c" backend.
+ * \param metadata Compiler-generated metadata.
+ */
+runtime::Module CreateCSourceCppMetadataModule(runtime::metadata::Metadata metadata);
+
 }  // namespace codegen
 }  // namespace tvm
 
diff --git a/src/tir/transforms/legalize_packed_calls.cc b/src/tir/transforms/legalize_packed_calls.cc
index cb2b50260326..2d8b6681fa84 100644
--- a/src/tir/transforms/legalize_packed_calls.cc
+++ b/src/tir/transforms/legalize_packed_calls.cc
@@ -75,6 +75,12 @@ class PackedCallLegalizer : public StmtExprMutator {
             new_stmts.push_back(tir::Evaluate(
                 tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
                                {sid_array, 0, tir::builtin::kArrData, call->args[i]})));
+            new_stmts.push_back(tir::Evaluate(
+                tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                               {sid_array, 0, tir::builtin::kArrDeviceType, kDLCPU})));
+            new_stmts.push_back(tir::Evaluate(
+                tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                               {sid_array, 0, tir::builtin::kArrDeviceId, 0})));
             packed_args.push_back(sid_array);
           }
         }
diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py
index 8252ee68ade8..b84e5eb6d775 100644
--- a/tests/python/relay/aot/test_c_device_api.py
+++ b/tests/python/relay/aot/test_c_device_api.py
@@ -225,17 +225,24 @@ def test_without_device_api_packed_api(non_device_api_main_func):
     """Test a graph without the Device API with the packed internal calls"""
 
     main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False)
-    assert (
-        str(main_func.body)
-        == 'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n'
-        + 'let tvm_value_2 = tir.tvm_stack_alloca("array", 1)\n'
-        + 'let tvm_value_1 = tir.tvm_stack_alloca("array", 1)\n'
-        + 'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n'
-        + "tir.tvm_struct_set(tvm_value_0, 0, 1, x_buffer_var)\n"
-        + "tir.tvm_struct_set(tvm_value_1, 0, 1, y_buffer_var)\n"
-        + "tir.tvm_struct_set(tvm_value_2, 0, 1, output_buffer_var)\n"
-        + "tir.tvm_struct_set(tvm_value_3, 0, 1, tir.reinterpret((uint64)0))\n"
-        + 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", tvm_value_0, tvm_value_1, tvm_value_2, tvm_value_3)\n'
+    assert str(main_func.body) == (
+        'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n'
+        'let tvm_value_2 = tir.tvm_stack_alloca("array", 1)\n'
+        'let tvm_value_1 = tir.tvm_stack_alloca("array", 1)\n'
+        'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n'
+        "tir.tvm_struct_set(tvm_value_0, 0, 1, x_buffer_var)\n"
+        "tir.tvm_struct_set(tvm_value_0, 0, 10, 1)\n"
+        "tir.tvm_struct_set(tvm_value_0, 0, 9, 0)\n"
+        "tir.tvm_struct_set(tvm_value_1, 0, 1, y_buffer_var)\n"
+        "tir.tvm_struct_set(tvm_value_1, 0, 10, 1)\n"
+        "tir.tvm_struct_set(tvm_value_1, 0, 9, 0)\n"
+        "tir.tvm_struct_set(tvm_value_2, 0, 1, output_buffer_var)\n"
+        "tir.tvm_struct_set(tvm_value_2, 0, 10, 1)\n"
+        "tir.tvm_struct_set(tvm_value_2, 0, 9, 0)\n"
+        "tir.tvm_struct_set(tvm_value_3, 0, 1, tir.reinterpret((uint64)0))\n"
+        "tir.tvm_struct_set(tvm_value_3, 0, 10, 1)\n"
+        "tir.tvm_struct_set(tvm_value_3, 0, 9, 0)\n"
+        'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", tvm_value_0, tvm_value_1, tvm_value_2, tvm_value_3)\n'
     )
 
 
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 0147b8cf755a..2ce36f19fcc8 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -16,6 +16,7 @@
 # under the License.
 
 from collections import OrderedDict
+import re
 import sys
 
 import numpy as np
@@ -49,7 +50,14 @@ def test_error_c_interface_with_packed_api():
     two = relay.add(relay.const(1), relay.const(1))
     func = relay.Function([], two)
 
-    with pytest.raises(tvm.TVMError, match="Packed interface required for packed operators"):
+    with pytest.raises(
+        tvm.TVMError,
+        match=re.escape(
+            'Either need interface_api == "packed" (got: c) or '
+            "unpacked-api == true (got: (bool)0) when targeting "
+            "c runtime"
+        ),
+    ):
         compile_and_run(
             AOTTestModel(
                 module=IRModule.from_expr(func), inputs={}, outputs=generate_ref_data(func, {})
diff --git a/tests/python/unittest/test_aot_legalize_packed_call.py b/tests/python/unittest/test_aot_legalize_packed_call.py
index 222d647f4ea7..54561ade23e4 100644
--- a/tests/python/unittest/test_aot_legalize_packed_call.py
+++ b/tests/python/unittest/test_aot_legalize_packed_call.py
@@ -57,8 +57,17 @@ def tir_packed_call() -> None:
             with T.let(tvm_value_1, T.tvm_stack_alloca("array", 1, dtype="handle")):
                 with T.let(tvm_value_0, T.tvm_stack_alloca("array", 1, dtype="handle")):
                     T.evaluate(T.tvm_struct_set(tvm_value_0, 0, 1, A, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_0, 0, 10, 1, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_0, 0, 9, 0, dtype="handle"))
+
                     T.evaluate(T.tvm_struct_set(tvm_value_1, 0, 1, B, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_1, 0, 10, 1, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_1, 0, 9, 0, dtype="handle"))
+
                     T.evaluate(T.tvm_struct_set(tvm_value_2, 0, 1, C, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_2, 0, 10, 1, dtype="handle"))
+                    T.evaluate(T.tvm_struct_set(tvm_value_2, 0, 9, 0, dtype="handle"))
+
                     T.evaluate(
                         T.tvm_call_cpacked(
                             "tvm_test_cpacked",

From 033885ccaf48e4897f99d10a6aabaf6c176a7823 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Wed, 8 Dec 2021 15:18:28 -0800
Subject: [PATCH 09/21] Add a runtime Module to mux between .text Metadata and
 live Metadata.

---
 src/runtime/meta_data.h | 10 ++++++
 src/runtime/metadata.cc | 71 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index c19229243f1f..2c803193ed23 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -27,6 +27,7 @@
 #include <dmlc/io.h>
 #include <dmlc/json.h>
 #include <tvm/runtime/executor_info.h>
+#include <tvm/runtime/metadata.h>
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
@@ -49,6 +50,15 @@ inline String get_name_mangled(const String& module_name, const String& name) {
   return ss.str();
 }
 
+/*!
+ * \brief Create a metadata module object.
+ *
+ * \param metadata Exported metadata structure.
+ *
+ * \return The created metadata module.
+ */
+Module MetadataModuleCreate(metadata::Metadata metadata);
+
 /*! \brief A tag to specify whether or not dynamic shared memory is used */
 constexpr const char* kUseDynamicSharedMemoryTag = "tir.use_dyn_shared_memory";
 
diff --git a/src/runtime/metadata.cc b/src/runtime/metadata.cc
index df2bb0d461f3..31144a3140cd 100644
--- a/src/runtime/metadata.cc
+++ b/src/runtime/metadata.cc
@@ -18,16 +18,23 @@
  */
 
 /*!
- * \file metadata.cc
- * \brief Implementations of the runtime component of Metadata.
+ * \file tvm/runtime/metadata.h
+ * \brief Defines implementations of TVM metadata which can exist in the runtime.
  */
 
+#include <tvm/runtime/c_backend_api.h>
+#include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/metadata.h>
+#include <tvm/runtime/registry.h>
+
+#include <string>
 
 namespace tvm {
 namespace runtime {
 namespace metadata {
 
+TVM_REGISTER_OBJECT_TYPE(MetadataBaseNode);
+
 ArrayAccessor<struct TVMTensorInfo, TensorInfo> MetadataNode::inputs() {
   return ArrayAccessor<struct TVMTensorInfo, TensorInfo>(data_->inputs, data_->num_inputs);
 }
@@ -55,5 +62,65 @@ TensorInfo::TensorInfo(const struct ::TVMTensorInfo* data)
 TVM_REGISTER_OBJECT_TYPE(TensorInfoNode);
 
 }  // namespace metadata
+
+class MetadataModuleNode : public ::tvm::runtime::ModuleNode {
+ public:
+  explicit MetadataModuleNode(runtime::metadata::Metadata metadata) {
+    // CHECK((metadata.defined() && code.size() > 0) || (!metadata.defined() && code.size() == 0))
+    //   << "metadata and code must both be either defined (when passed from compiler) or undefined
+    //   "
+    //   << "(when passed from runtime)";
+    metadata_ = metadata;
+    //    code_ = code;
+  }
+
+  const char* type_key() const { return "metadata_module"; }
+
+  static Module LoadFromBinary() {
+    return Module(make_object<MetadataModuleNode>(runtime::metadata::Metadata()));
+  }
+
+  void SaveToBinary(dmlc::Stream* stream) final {}
+
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) {
+    if (name == "get_metadata") {
+      return PackedFunc([this, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
+        if (!metadata_.defined()) {
+          TVMFunctionHandle f_handle;
+          int32_t ret_code = TVMBackendGetFuncFromEnv(this, "get_c_metadata", &f_handle);
+          CHECK_EQ(ret_code, 0) << "Unable to locate get_c_metadata PackedFunc";
+
+          TVMValue ret_value;
+          int ret_type_code;
+          ret_code = TVMFuncCall(f_handle, nullptr, nullptr, 0, &ret_value, &ret_type_code);
+          CHECK_EQ(ret_code, 0) << "Invoking get_c_metadata: TVMFuncCall returned " << ret_code;
+
+          CHECK_EQ(ret_type_code, kTVMOpaqueHandle)
+              << "Expected kOpaqueHandle returned; got " << ret_type_code;
+          CHECK(ret_value.v_handle != nullptr) << "get_c_metadata returned nullptr";
+
+          metadata_ = runtime::metadata::Metadata(
+              static_cast<const struct ::TVMMetadata*>(ret_value.v_handle));
+        }
+
+        *rv = metadata_;
+        return;
+      });
+    }
+
+    return PackedFunc();
+  }
+
+ private:
+  runtime::metadata::Metadata metadata_;
+};
+
+Module MetadataModuleCreate(metadata::Metadata metadata) {
+  return Module(make_object<MetadataModuleNode>(metadata));
+}
+
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_metadata_module")
+    .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = MetadataModuleNode::LoadFromBinary(); });
+
 }  // namespace runtime
 }  // namespace tvm

From 3717f0c35e072c05ec989f08caae53e192a34d1b Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Wed, 8 Dec 2021 16:33:51 -0800
Subject: [PATCH 10/21] Move launch_param to namespace

---
 src/runtime/meta_data.h            | 4 ++++
 src/runtime/thread_storage_scope.h | 3 ++-
 src/target/build_common.h          | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 2c803193ed23..766b93261ac0 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -59,9 +59,13 @@ inline String get_name_mangled(const String& module_name, const String& name) {
  */
 Module MetadataModuleCreate(metadata::Metadata metadata);
 
+namespace launch_param {
+
 /*! \brief A tag to specify whether or not dynamic shared memory is used */
 constexpr const char* kUseDynamicSharedMemoryTag = "tir.use_dyn_shared_memory";
 
+}  // namespace launch_param
+
 /*! \brief function information needed by device */
 struct FunctionInfo {
   std::string name;
diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h
index d577770db1a9..4122f9d0798e 100644
--- a/src/runtime/thread_storage_scope.h
+++ b/src/runtime/thread_storage_scope.h
@@ -24,6 +24,7 @@
 #ifndef TVM_RUNTIME_THREAD_STORAGE_SCOPE_H_
 #define TVM_RUNTIME_THREAD_STORAGE_SCOPE_H_
 
+#include <tvm/runtime/metadata.h>
 #include <tvm/runtime/packed_func.h>
 
 #include <string>
@@ -205,7 +206,7 @@ class LaunchParamConfig {
     std::vector<bool> filled(6, false);
     for (size_t i = 0; i < launch_param_tags.size(); ++i) {
       const std::string& tag = launch_param_tags[i];
-      if (tag == kUseDynamicSharedMemoryTag) {
+      if (tag == launch_param::kUseDynamicSharedMemoryTag) {
         ICHECK_EQ(i, launch_param_tags.size() - 1)
             << "kUseDynamicSharedMemoryTag should be the last tag in launch_param_tags.";
         use_dyn_shared_memory_ = true;
diff --git a/src/target/build_common.h b/src/target/build_common.h
index c66c2b52822e..6c94ec8703b7 100644
--- a/src/target/build_common.h
+++ b/src/target/build_common.h
@@ -58,7 +58,7 @@ inline std::unordered_map<std::string, runtime::FunctionInfo> ExtractFuncInfo(co
     }
     if (auto opt = f->GetAttr<Integer>(tir::attr::kDeviceUseDynSharedMemory)) {
       if (opt.value()) {
-        info.launch_param_tags.push_back(runtime::kUseDynamicSharedMemoryTag);
+        info.launch_param_tags.push_back(runtime::launch_param::kUseDynamicSharedMemoryTag);
       }
     }
     auto global_symbol = f->GetAttr<String>(tvm::attr::kGlobalSymbol);

From b78b6c4445a3921acc8f4a1850993a56c06e3ede Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 9 Dec 2021 12:08:17 -0800
Subject: [PATCH 11/21] Add test of c++ AOT.

---
 tests/python/relay/aot/test_cpp_aot.py | 94 ++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 tests/python/relay/aot/test_cpp_aot.py

diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
new file mode 100644
index 000000000000..faee1d75b52e
--- /dev/null
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import sys
+import textwrap
+
+import numpy as np
+import pytest
+
+import tvm
+from tvm import relay, TVMError
+from tvm.ir.module import IRModule
+from tvm.relay import backend, testing, transform
+from tvm.relay.testing import byoc
+from tvm.relay.op.annotation import compiler_begin, compiler_end
+from aot_test_utils import (
+    AOTTestModel,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    convert_to_relay,
+    compile_and_run,
+    compile_models,
+    parametrize_aot_options,
+)
+
+
+def print_mod_tree(m, indent=0):
+    print(f"{' ' * indent} - {m!r}")
+    for i in m.imported_modules:
+      print_mod_tree(i, indent + 2)
+
+def test_conv2d():
+    RELAY_MODEL = textwrap.dedent("""\
+        #[version = "0.0.5"]
+        def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
+            %1 = nn.conv2d(
+                 %data,
+                 %weight,
+                 padding=[2, 2],
+                 channels=8,
+                 kernel_size=[5, 5],
+                 data_layout="NCHW",
+                 kernel_layout="OIHW",
+                 out_dtype="int32");
+          %1
+        }
+    """)
+    ir_mod = tvm.parser.fromtext(RELAY_MODEL)
+
+    main_func = ir_mod["main"]
+    shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
+    type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
+
+    weight_data = np.ones(shape_dict["weight"]).astype(type_dict["weight"])
+    input_data = np.ones(shape_dict["data"]).astype(type_dict["data"])
+
+    params = {"weight": weight_data}
+    inputs = {"data": input_data}
+    output_list = generate_ref_data(ir_mod, inputs, params)
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(ir_mod, params=params, target="c",
+                              executor=backend.Executor("aot", {"interface-api": "c"}))
+
+    print_mod_tree(mod.module)
+
+    with tvm.contrib.utils.TempDirectory.set_keep_for_debug():
+        mod.export_library("test.so")
+    mod.export_library("test.tar")
+    runner = tvm.runtime.load_module("test.so")
+    print_mod_tree(runner)
+    runner = tvm.runtime.executor.AotModule(runner["default"](tvm.cpu(0)))
+    runner.set_input(**inputs)
+    runner.run()
+    assert (runner.get_output(0).asnumpy() == output_list[0]).all()
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 2384fcc4953fb3aadec119b8f474069a3ee266f5 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Sun, 12 Dec 2021 22:25:45 -0800
Subject: [PATCH 12/21] Fix incongruity between kTvmRuntimeCrt constant

---
 include/tvm/relay/runtime.h  | 6 ++++++
 src/relay/backend/runtime.cc | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/tvm/relay/runtime.h b/include/tvm/relay/runtime.h
index a925045f9f41..10e124bc339b 100644
--- a/include/tvm/relay/runtime.h
+++ b/include/tvm/relay/runtime.h
@@ -44,6 +44,12 @@ class AttrRegistry;
 
 namespace relay {
 
+/*! \brief Value used with Runtime::name to indicate the C++ runtime. */
+static constexpr const char* kTvmRuntimeCpp = "cpp";
+
+/*! \brief Value used with Runtime::name to indicate the C runtime. */
+static constexpr const char* kTvmRuntimeCrt = "crt";
+
 /*!
  * \brief Runtime information.
  *
diff --git a/src/relay/backend/runtime.cc b/src/relay/backend/runtime.cc
index 786d6f937f14..923c9b2d5f65 100644
--- a/src/relay/backend/runtime.cc
+++ b/src/relay/backend/runtime.cc
@@ -88,9 +88,9 @@ RuntimeRegEntry& RuntimeRegEntry::RegisterOrGet(const String& name) {
 
 /**********  Register Runtimes and options  **********/
 
-TVM_REGISTER_RUNTIME("crt").add_attr_option<Bool>("system-lib");
+TVM_REGISTER_RUNTIME(kTvmRuntimeCrt).add_attr_option<Bool>("system-lib");
 
-TVM_REGISTER_RUNTIME("cpp").add_attr_option<Bool>("system-lib");
+TVM_REGISTER_RUNTIME(kTvmRuntimeCpp).add_attr_option<Bool>("system-lib");
 
 /**********  Registry  **********/
 

From 5ec990f8828318b1b9c5dd605b28faa1033df2d7 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 15 Feb 2022 19:21:06 -0800
Subject: [PATCH 13/21] Expand ExecutorCodegenMetadata to include AOT runtime
 metadata.

---
 src/relay/backend/aot_executor_codegen.cc   | 16 +++++++++++++---
 src/relay/backend/graph_executor_codegen.cc |  5 +++++
 src/relay/backend/utils.cc                  |  7 +++++--
 src/relay/backend/utils.h                   | 13 ++++++++++---
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 3fa4fa5e8066..4009d1029582 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -723,6 +723,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
       tir::Buffer buffer = tir::Buffer(buffer_var, elem_type, tensor_type->shape, {}, 0,
                                        name + "_buffer", 16, 1, tir::BufferType::kDefault);
       main_buffer_map_.Set(var, buffer);
+      io_tensor_types_.Set(var, Downcast<TensorType>(expr->checked_type()));
     }
   }
 
@@ -831,6 +832,8 @@ class AOTExecutorCodegen : public MixedModeVisitor {
   Array<tir::Var> main_signature_;
   /*! \brief input and output variables belonging to the main function signature */
   Map<tir::Var, tir::Buffer> main_buffer_map_;
+  /*! \brief maps input and output variables to TensorType which describe them */
+  Map<tir::Var, TensorType> io_tensor_types_;
   /*! \brief target device */
   tec::TargetMap targets_;
   /*! \brief target host */
@@ -1060,6 +1063,11 @@ class AOTExecutorCodegen : public MixedModeVisitor {
                         tir_main_func->params.begin() + tir_main_func->params.size() -
                             return_sid_.size() - pool_vars.size() - devices.size());
 
+    Array<TensorType> input_tensor_types;
+    for (auto i : inputs) {
+      input_tensor_types.push_back(io_tensor_types_[i]);
+    }
+
     std::vector<String> output_var_names;
     if (auto opt = func->GetAttr<Array<String>>("output_tensor_names")) {
       Array<String> output_tensor_names = opt.value();
@@ -1079,9 +1087,11 @@ class AOTExecutorCodegen : public MixedModeVisitor {
       }
     }
 
-    ret.metadata = ExecutorCodegenMetadata(inputs, pool_vars, devices, output_var_names,
-                                           runtime::kTvmExecutorAot, mod_name, interface_api,
-                                           use_unpacked_api_, pool_var_info);
+    Array<TensorType> output_tensor_types{final_aot_allocator.GetReturnTtypes()};
+
+    ret.metadata = ExecutorCodegenMetadata(
+        inputs, input_tensor_types, output_var_names, output_tensor_types, pool_vars, devices,
+        runtime::kTvmExecutorAot, mod_name, interface_api, use_unpacked_api_, pool_var_info);
     return ret;
   }
 
diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc
index 16d0207ff278..30bc0beeebce 100644
--- a/src/relay/backend/graph_executor_codegen.cc
+++ b/src/relay/backend/graph_executor_codegen.cc
@@ -290,6 +290,11 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     // This is the point where we separate the functions in the module by target
     ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod);
     ret.external_mods = external_modules.value();
+    ret.metadata =
+        ExecutorCodegenMetadata({} /* inputs */, {} /* input_tensor_types */, {} /* outputs */,
+                                {} /* output_tensor_types */, {} /* pools */, {} /* devices */,
+                                runtime::kTvmExecutorGraph /* executor */, mod_name_ /* mod_name */,
+                                "packed" /* interface_api */, Bool(false) /* unpacked_api */);
     return ret;
   }
 
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index a3eb00ae984c..990d619e4231 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -179,14 +179,17 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     });
 
 ExecutorCodegenMetadata::ExecutorCodegenMetadata(
-    Array<tir::Var> inputs, Array<tir::Var> pools, Array<String> devices, Array<String> outputs,
+    Array<tir::Var> inputs, Array<TensorType> input_tensor_types, Array<String> outputs,
+    Array<TensorType> output_tensor_types, Array<tir::Var> pools, Array<String> devices,
     String executor, String mod_name, String interface_api, bool unpacked_api,
     Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs) {
   auto n = make_object<ExecutorCodegenMetadataNode>();
   n->inputs = inputs;
+  n->input_tensor_types = input_tensor_types;
+  n->outputs = outputs;
+  n->output_tensor_types = output_tensor_types;
   n->pools = pools;
   n->devices = devices;
-  n->outputs = outputs;
   n->executor = executor;
   n->interface_api = interface_api;
   n->unpacked_api = unpacked_api;
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index d04bc9bb26ea..3b4d4c18de89 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -64,8 +64,12 @@ class ExecutorCodegenMetadataNode : public Object {
  public:
   /*! \brief input information for the main function */
   Array<tir::Var> inputs;
+  /*! \brief input tensor type information */
+  Array<TensorType> input_tensor_types;
   /*! \brief output information for the main function */
   Array<String> outputs;
+  /*! \brief output tensor type information */
+  Array<TensorType> output_tensor_types;
   /*! \brief pool information for the main function */
   Array<tir::Var> pools;
   /*! \brief device contexts information for the main function */
@@ -83,8 +87,10 @@ class ExecutorCodegenMetadataNode : public Object {
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("inputs", &inputs);
-    v->Visit("pools", &pools);
+    v->Visit("input_tensor_types", &input_tensor_types);
     v->Visit("outputs", &outputs);
+    v->Visit("output_tensor_types", &output_tensor_types);
+    v->Visit("pools", &pools);
     v->Visit("devices", &devices);
     v->Visit("executor", &executor);
     v->Visit("unpacked_api", &unpacked_api);
@@ -100,8 +106,9 @@ class ExecutorCodegenMetadataNode : public Object {
  */
 class ExecutorCodegenMetadata : public ObjectRef {
  public:
-  TVM_DLL ExecutorCodegenMetadata(Array<tir::Var> inputs, Array<tir::Var> pools,
-                                  Array<String> devices, Array<String> outputs, String executor,
+  TVM_DLL ExecutorCodegenMetadata(Array<tir::Var> inputs, Array<TensorType> input_tensor_types,
+                                  Array<String> outputs, Array<TensorType> output_tensor_types,
+                                  Array<tir::Var> pools, Array<String> devices, String executor,
                                   String mod_name, String interface_api = "packed",
                                   bool unpacked_api = false,
                                   Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs =

From aef76ca8ae6c748945f745976477d66f7cbaa803 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 15 Feb 2022 21:27:29 -0800
Subject: [PATCH 14/21] commit cpp test

---
 tests/python/relay/aot/test_cpp_aot.py | 83 +++++++++++++++++++-------
 1 file changed, 61 insertions(+), 22 deletions(-)

diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index faee1d75b52e..54cdf09639d8 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -39,27 +39,38 @@
 )
 
 
-def print_mod_tree(m, indent=0):
-    print(f"{' ' * indent} - {m!r}")
-    for i in m.imported_modules:
-      print_mod_tree(i, indent + 2)
+enable_usmp = tvm.testing.parameter(True, False)
 
-def test_conv2d():
-    RELAY_MODEL = textwrap.dedent("""\
+
+def test_conv2d(enable_usmp):
+    RELAY_MODEL = textwrap.dedent(
+        """\
         #[version = "0.0.5"]
-        def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
+        def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(3, 3, 5, 5), int8]) {
             %1 = nn.conv2d(
                  %data,
                  %weight,
                  padding=[2, 2],
-                 channels=8,
+                 channels=3,
+                 kernel_size=[5, 5],
+                 data_layout="NCHW",
+                 kernel_layout="OIHW",
+                 out_dtype="int32");
+            %2 = cast(nn.max_pool2d(%1, pool_size=[3, 3]), dtype="int8");
+            %3 = nn.conv2d(
+                 %2,
+                 %weight,
+                 padding=[2, 2],
+                 channels=3,
                  kernel_size=[5, 5],
                  data_layout="NCHW",
                  kernel_layout="OIHW",
                  out_dtype="int32");
-          %1
+            %4 = nn.max_pool2d(%3, pool_size=[3, 3]);
+            %4
         }
-    """)
+    """
+    )
     ir_mod = tvm.parser.fromtext(RELAY_MODEL)
 
     main_func = ir_mod["main"]
@@ -71,23 +82,51 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
 
     params = {"weight": weight_data}
     inputs = {"data": input_data}
-    output_list = generate_ref_data(ir_mod, inputs, params)
+    ref_outputs = generate_ref_data(ir_mod, inputs, params)
 
-    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        mod = tvm.relay.build(ir_mod, params=params, target="c",
-                              executor=backend.Executor("aot", {"interface-api": "c"}))
+    with tvm.transform.PassContext(
+        opt_level=3, config={"tir.disable_vectorize": True, "tir.usmp.enable": enable_usmp}
+    ):
+        mod = tvm.relay.build(
+            ir_mod,
+            params=params,
+            target="c",
+            executor=backend.Executor("aot", {"interface-api": "packed"}),
+        )
+
+    temp_dir = tvm.contrib.utils.TempDirectory()
+    test_so_path = temp_dir / "test.so"
+    mod.export_library(test_so_path, cc="gcc", options=["-std=c11"])
+    loaded_mod = tvm.runtime.load_module(test_so_path)
+    runner = tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0)))
+    runner.set_input(**inputs)
+    runner.run()
+    assert (runner.get_output(0).asnumpy() == list(ref_outputs.values())[0]).all()
 
-    print_mod_tree(mod.module)
 
-    with tvm.contrib.utils.TempDirectory.set_keep_for_debug():
-        mod.export_library("test.so")
-    mod.export_library("test.tar")
-    runner = tvm.runtime.load_module("test.so")
-    print_mod_tree(runner)
-    runner = tvm.runtime.executor.AotModule(runner["default"](tvm.cpu(0)))
+def test_mobilenet():
+    ir_mod, params = testing.mobilenet.get_workload(batch_size=1)
+    data_shape = [int(x) for x in ir_mod["main"].checked_type.arg_types[0].shape]
+    data = np.random.uniform(size=data_shape).astype("float32")
+    inputs = {"data": data}
+    ref_outputs = generate_ref_data(ir_mod, inputs, params)
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(
+            ir_mod,
+            params=params,
+            target="c",
+            executor=backend.Executor("aot", {"interface-api": "packed"}),
+        )
+
+    temp_dir = tvm.contrib.utils.TempDirectory()
+    test_so_path = temp_dir / "test.so"
+    mod.export_library(test_so_path, cc="gcc", options=["-std=c11"])
+    loaded_mod = tvm.runtime.load_module(test_so_path)
+    runner = tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0)))
     runner.set_input(**inputs)
     runner.run()
-    assert (runner.get_output(0).asnumpy() == output_list[0]).all()
+    assert (runner.get_output(0).asnumpy() == list(ref_outputs.values())[0]).all()
 
 
 if __name__ == "__main__":

From 84e5b92e9425d186194a3a86dd2080452c1d692d Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Fri, 25 Feb 2022 13:33:58 -0800
Subject: [PATCH 15/21] Make Metadata compile under C.

---
 include/tvm/runtime/metadata.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/metadata.h b/include/tvm/runtime/metadata.h
index 49eef34d0819..4a8a4e0b2f60 100644
--- a/include/tvm/runtime/metadata.h
+++ b/include/tvm/runtime/metadata.h
@@ -33,12 +33,13 @@
 #include <tvm/runtime/c_runtime_api.h>
 #ifdef __cplusplus
 #include <tvm/runtime/metadata_base.h>
-#endif
 #include <tvm/support/span.h>
+#endif
 
 // Version number recorded in emitted artifacts for runtime checking.
 #define TVM_METADATA_VERSION 1
 
+#ifdef __cplusplus
 namespace tvm {
 namespace runtime {
 namespace metadata {
@@ -51,7 +52,6 @@ static const constexpr int64_t kMetadataVersion = TVM_METADATA_VERSION;
 }  // namespace runtime
 }  // namespace tvm
 
-#ifdef __cplusplus
 extern "C" {
 #endif
 

From ccc6c9c614cf1e5239071822b4f4e053614c66f3 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Fri, 25 Feb 2022 17:22:25 -0800
Subject: [PATCH 16/21] Ignore ephemeral metadata_module
 export_model_library_format.

 * This module does not need to be exported, since it is merely a C++
   wrapper around get_c_metadata, and get_metadata is not used in C.
---
 python/tvm/micro/model_library_format.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index d53c4ed49939..6b59b3443078 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -64,6 +64,10 @@ def generate_c_interface_header(
     return metadata_header
 
 
+# List of type_key for modules which are ephemeral and do not need to be exported.
+EPHEMERAL_MODULE_TYPE_KEYS = ("metadata_module",)
+
+
 def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None):
     """Populate the codegen sub-directory as part of a Model Library Format export.
 
@@ -79,6 +83,11 @@ def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None):
     """
     dso_modules = mod._collect_dso_modules()
     non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules)
+
+    # Filter ephemeral modules which cannot be exported.
+    dso_modules = [m for m in dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+    non_dso_modules = [m for m in non_dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+
     if non_dso_modules:
         raise UnsupportedInModelLibraryFormatError(
             f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"

From c0be55cfc3f439b742a116c77fc018a32ee0115c Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Tue, 1 Mar 2022 11:49:09 -0800
Subject: [PATCH 17/21] address manupa, kparszsyc, masahi comments.

---
 include/tvm/runtime/metadata.h                |  2 +-
 python/tvm/relay/build_module.py              | 72 +++++++++++++++++--
 python/tvm/runtime/executor/aot_executor.py   | 18 ++---
 src/relay/backend/aot_executor_codegen.cc     |  5 +-
 src/runtime/aot_executor/aot_executor.cc      |  4 +-
 .../aot_executor/aot_executor_factory.cc      |  2 +-
 src/target/metadata_module.cc                 |  2 +
 src/target/source/source_module.cc            |  9 ---
 tests/python/relay/aot/test_cpp_aot.py        | 37 ++++++++++
 9 files changed, 121 insertions(+), 30 deletions(-)

diff --git a/include/tvm/runtime/metadata.h b/include/tvm/runtime/metadata.h
index 4a8a4e0b2f60..cd65f6fb7486 100644
--- a/include/tvm/runtime/metadata.h
+++ b/include/tvm/runtime/metadata.h
@@ -75,7 +75,7 @@ struct TVMMetadata {
   const struct TVMTensorInfo* outputs;
   /*! \brief Number of elements in `outputs` array. */
   int64_t num_outputs;
-  /*! \brief Memory Pools needed by the AOT run_model function.
+  /*! \brief Memory Pools needed by the AOT main function.
    * The order of the elements is the same as in the arguments to run_model. That is to say,
    * this array specifies the last `num_pools` arguments to run_model.
    */
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 5cfd3a16c3bc..b8ea35e38b6a 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -27,8 +27,9 @@
 from tvm.tir import expr as tvm_expr
 from tvm.target import Target
 from .. import nd as _nd, autotvm, register_func
+from ..runtime.executor import aot_executor as _aot_executor
 from ..target import Target
-from ..contrib import graph_executor as _graph_rt
+from ..contrib import graph_executor as _graph_executor
 from . import _build_module
 from . import ty as _ty
 from . import expr as _expr
@@ -612,7 +613,7 @@ def _make_executor(self, expr=None):
                 "Graph Executor only supports static graphs, got output type", ret_type
             )
         mod = build(self.mod, target=self.target)
-        gmodule = _graph_rt.GraphModule(mod["default"](self.device))
+        gmodule = _graph_executor.GraphModule(mod["default"](self.device))
 
         def _unflatten(flat_iter, cur_type):
             if isinstance(cur_type, _ty.TensorType):
@@ -641,6 +642,66 @@ def _graph_wrapper(*args, **kwargs):
         return _graph_wrapper
 
 
+class AotExecutor(_interpreter.Executor):
+    """Implements the Executor interface for AOT.
+
+    Parameters
+    ----------
+    mod : :py:class:`~tvm.IRModule`
+        The module to support the execution.
+
+    device : :py:class:`Device`
+        The runtime device to run the code on.
+
+    target : :py:class:`Target`
+        The target option to build the function.
+    """
+
+    def __init__(self, mod, device, target):
+        assert mod is not None
+        self.mod = mod
+        self.device = device
+        self.target = target
+
+    def _make_executor(self, expr=None):
+        if expr:
+            self.mod["main"] = expr
+        self.mod = InferType()(self.mod)
+        ret_type = self.mod["main"].checked_type.ret_type
+        if _ty.is_dynamic(ret_type):
+            raise ValueError(
+                "AOT Executor only supports static graphs, got output type", ret_type
+            )
+        mod = build(self.mod, target=self.target)
+        gmodule = _aot_executor.AotModule(mod["default"](self.device))
+
+        def _unflatten(flat_iter, cur_type):
+            if isinstance(cur_type, _ty.TensorType):
+                return next(flat_iter)
+            if isinstance(cur_type, _ty.TupleType):
+                fields = []
+                for field_type in cur_type.fields:
+                    field = _unflatten(flat_iter, field_type)
+                    fields.append(field)
+                return fields
+            raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
+
+        def _aot_wrapper(*args, **kwargs):
+            args = self._convert_args(self.mod["main"], args, kwargs)
+            # Create map of inputs.
+            for i, arg in enumerate(args):
+                gmodule.set_input(i, arg)
+            # Run the module, and fetch the output.
+            gmodule.run()
+            flattened = []
+            for i in range(gmodule.get_num_outputs()):
+                flattened.append(gmodule.get_output(i).copyto(_nd.cpu(0)))
+            unflattened = _unflatten(iter(flattened), ret_type)
+            return unflattened
+
+        return _graph_wrapper
+
+
 # TODO(mbs): Collapse the create_executor/evaluate phases together since a) most callers don't
 # reuse the executor for multiple expressions and b) any preparation necessary for the expression
 # evaluation needs to (currently) be done along with preparation for the module.
@@ -664,9 +725,8 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N
     Parameters
     ----------
     kind : str
-        The type of executor. Avaliable options are `debug` for the
-        interpreter, `graph` for the graph executor, and `vm` for the virtual
-        machine.
+        The type of executor. Avaliable options are `debug` for the interpreter, `graph` for the
+        graph executor, `aot` for the aot executor, and `vm` for the virtual machine.
 
     mod : :py:class:`~tvm.IRModule`
         The Relay module containing collection of functions
@@ -703,4 +763,6 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N
         return GraphExecutor(mod, device, target)
     if kind == "vm":
         return VMExecutor(mod, device, target)
+    if kind == "aot":
+        return AotModule(mod)
     raise RuntimeError("unknown execution strategy: {0}".format(kind))
diff --git a/python/tvm/runtime/executor/aot_executor.py b/python/tvm/runtime/executor/aot_executor.py
index 91f056ff25fa..9ef0d1dee894 100644
--- a/python/tvm/runtime/executor/aot_executor.py
+++ b/python/tvm/runtime/executor/aot_executor.py
@@ -29,12 +29,12 @@ class AotModule(object):
     Parameters
     ----------
     module : tvm.runtime.Module
-        The internal tvm module that holds the actual graph functions.
+        The internal tvm module that holds the implemented model functions.
 
     Attributes
     ----------
     module : tvm.runtime.Module
-        The internal tvm module that holds the actual graph functions.
+        The internal tvm module that holds the implemented model functions.
 
     Examples
     --------
@@ -51,9 +51,9 @@ class AotModule(object):
         # load it back as a runtime
         lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
         # Call the library factory function for default and create
-        # a new runtime.Module, wrap with graph module.
-        gmod = graph_executor.GraphModule(lib["default"](dev))
-        # use the graph module.
+        # a new runtime.Module, wrap with aot module.
+        gmod = tvm.runtime.executor.AotModule(lib["default"](dev))
+        # use the aot  module.
         gmod.set_input("x", data)
         gmod.run()
     """
@@ -85,7 +85,7 @@ def set_input(self, key=None, value=None, **params):
         if key is not None:
             v = self._get_input(key)
             if v is None:
-                raise RuntimeError("Could not find '%s' in graph's inputs" % key)
+                raise RuntimeError("Could not find '%s' in model's inputs" % key)
             v.copyfrom(value)
 
         if params:
@@ -101,7 +101,7 @@ def set_input(self, key=None, value=None, **params):
                     self._get_input(k).copyfrom(params[k])
 
     def run(self, **input_dict):
-        """Run forward execution of the graph
+        """Run forward execution of the model
 
         Parameters
         ----------
@@ -113,7 +113,7 @@ def run(self, **input_dict):
         self._run()
 
     def get_num_outputs(self):
-        """Get the number of outputs from the graph
+        """Get the number of outputs from the model
 
         Returns
         -------
@@ -123,7 +123,7 @@ def get_num_outputs(self):
         return self._get_num_outputs()
 
     def get_num_inputs(self):
-        """Get the number of inputs to the graph
+        """Get the number of inputs to the model
 
         Returns
         -------
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 4009d1029582..636c3caf3159 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -327,7 +327,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     }
   }
 
-  void PushArgs(const Expr& expr, std::vector<tir::Var> sids, Array<PrimExpr>* args) {
+  void PushArgs(const Expr& expr, const std::vector<tir::Var>& sids, Array<PrimExpr>* args) {
     const TupleNode* t = expr.as<TupleNode>();
     if (t != nullptr) {
       CHECK_EQ(sids.size(), t->fields.size()) << "Relay tuple does not map 1:1 into TIR; AOT can't "
@@ -920,8 +920,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
           << ") or unpacked-api == true (got: " << use_unpacked_api_
           << ") when targeting c runtime";
     } else if (runtime_config->name == kTvmRuntimeCpp) {
-      CHECK(static_cast<bool>(use_unpacked_api_) == false &&
-            static_cast<bool>(use_call_cpacked_) == true)
+      CHECK(static_cast<bool>(use_unpacked_api_) == false)
           << "Need unpacked-api == false (got: " << use_unpacked_api_
           << ") and interface-api == \"packed\" (got: " << interface_api
           << ") when targeting c++ runtime";
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 403a8f09e2ae..2a591092aaaf 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -133,8 +133,8 @@ void AotExecutor::Run() {
   ICHECK(pf != nullptr) << "Module entrypoint is not defined";
 
   const int num_args = args_.size();
-  ::std::unique_ptr<TVMValue> call_values{new TVMValue[num_args]};
-  ::std::unique_ptr<int> call_type_codes{new int[num_args]};
+  auto call_values = ::std::make_unique<TVMValue[]>(num_args);
+  auto call_type_codes = ::std::make_unique<int[]>(num_args);
   for (int i = 0; i < num_args; ++i) {
     auto managed = args_[i].ToDLPack();
     call_values.get()[i].v_handle = &managed->dl_tensor;
diff --git a/src/runtime/aot_executor/aot_executor_factory.cc b/src/runtime/aot_executor/aot_executor_factory.cc
index 4cb3026991fe..7760f0fe6c4d 100644
--- a/src/runtime/aot_executor/aot_executor_factory.cc
+++ b/src/runtime/aot_executor/aot_executor_factory.cc
@@ -19,7 +19,7 @@
 
 /*!
  * \file aot_executor_factory.cc
- * \brief Graph executor factory implementations
+ * \brief AOT executor factory implementations
  */
 
 #include "./aot_executor_factory.h"
diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
index 1fec52c46609..ca55b46805d6 100644
--- a/src/target/metadata_module.cc
+++ b/src/target/metadata_module.cc
@@ -80,6 +80,8 @@ static runtime::Module CreateCrtMetadataModule(
 // TODO(areusch,masahi): Unify metadata representation and remove the need for this function
 static runtime::metadata::Metadata ConvertMetaData(
     relay::backend::ExecutorCodegenMetadata metadata) {
+  ICHECK_NOTNULL(metadata->pool_inputs);
+
   std::vector<runtime::metadata::TensorInfo> inputs;
   ICHECK(metadata.defined());
   for (size_t i = 0; i < metadata->inputs.size(); ++i) {
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index e6acc1387806..461f5d9b6086 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -554,12 +554,9 @@ class MetadataQueuer : public AttrVisitor {
       auto metadata = Downcast<runtime::metadata::MetadataBase>(*value);
       const runtime::metadata::MetadataArrayNode* arr =
           value->as<runtime::metadata::MetadataArrayNode>();
-      std::cout << "Is array? " << arr << std::endl;
       if (arr != nullptr) {
         for (unsigned int i = 0; i < arr->array.size(); i++) {
           ObjectRef o = arr->array[i];
-          std::cout << "queue-visiting array element " << i << ": " << o->type_index() << " ("
-                    << o.operator->() << ")" << std::endl;
           if (o.as<runtime::metadata::MetadataBaseNode>() != nullptr) {
             std::stringstream ss;
             ss << i;
@@ -656,14 +653,10 @@ class MetadataSerializer : public AttrVisitor {
   }
 
   void VisitArray(const runtime::metadata::MetadataArrayNode* array) {
-    std::cout << "visit array " << array << ": " << array->type_index << " " << array->struct_name
-              << "," << array->array.size() << std::endl;
     auto old_is_first_item = is_first_item_;
     is_first_item_ = true;
     for (unsigned int i = 0; i < array->array.size(); ++i) {  // ObjectRef o : *(array->array)) {
       ObjectRef o = array->array[i];
-      std::cout << "visiting array element " << i << ": " << o->type_index() << " ("
-                << o.operator->() << ")" << std::endl;
       if (o->IsInstance<IntImmNode>()) {
         int64_t i = Downcast<Integer>(o);
         Visit(nullptr, &i);
@@ -690,7 +683,6 @@ class MetadataSerializer : public AttrVisitor {
   void Visit(const char* key, ObjectRef* value) final {
     const runtime::metadata::MetadataArrayNode* arr =
         value->as<runtime::metadata::MetadataArrayNode>();
-    std::cout << "Is array? " << arr << std::endl;
     if (arr != nullptr) {
       WriteComma();
       if (key != nullptr) {
@@ -725,7 +717,6 @@ class MetadataSerializer : public AttrVisitor {
       auto struct_name = std::get<0>(item);
       auto obj = std::get<1>(item);
       auto arr = obj.as<runtime::metadata::MetadataArrayNode>();
-      std::cout << "codegen: " << struct_name;
       is_first_item_ = true;
       address_.push_back(struct_name);
       if (arr != nullptr) {
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index 54cdf09639d8..73d8b2182953 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -16,6 +16,7 @@
 # under the License.
 
 
+import re
 import sys
 import textwrap
 
@@ -39,6 +40,32 @@
 )
 
 
+def test_error_c_interface():
+    interface_api = "c"
+    use_unpacked_api = False
+    test_runner = AOT_DEFAULT_RUNNER
+
+    two = relay.add(relay.const(1), relay.const(1))
+    func = relay.Function([], two)
+
+    with pytest.raises(
+        tvm.TVMError,
+        match=re.escape(
+            'Either need interface_api == "packed" (got: c) or '
+            "unpacked-api == true (got: (bool)0) when targeting "
+            "c runtime"
+        ),
+    ):
+        compile_and_run(
+            AOTTestModel(
+                module=IRModule.from_expr(func), inputs={}, outputs=generate_ref_data(func, {})
+            ),
+            test_runner,
+            interface_api,
+            use_unpacked_api,
+        )
+
+
 enable_usmp = tvm.testing.parameter(True, False)
 
 
@@ -129,5 +156,15 @@ def test_mobilenet():
     assert (runner.get_output(0).asnumpy() == list(ref_outputs.values())[0]).all()
 
 
+def test_create_executor():
+    x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
+    expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
+    actual = relay.create_executor(
+        "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr))
+    ).evaluate()(np.array([2], dtype="float32"))
+
+    assert np.array([3.], dtype="float32") == actual
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))

From fda6d533405e9963452e46fd8834974fcefea8a6 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 2 Mar 2022 10:16:55 -0800
Subject: [PATCH 18/21] further address comments

---
 include/tvm/runtime/module.h              |  2 ++
 python/tvm/relay/build_module.py          | 18 +++++++++++++---
 src/relay/backend/aot_executor_codegen.cc | 10 ++++-----
 src/runtime/aot_executor/aot_executor.cc  |  7 +++++++
 src/runtime/aot_executor/aot_executor.h   |  7 ++-----
 src/runtime/metadata.cc                   | 19 ++++++-----------
 src/target/metadata.h                     |  6 +++---
 src/target/source/codegen_source_base.h   |  1 +
 src/target/source/source_module.cc        |  9 ++++----
 tests/python/relay/aot/test_cpp_aot.py    | 25 +++++++++++++++++++++--
 10 files changed, 68 insertions(+), 36 deletions(-)

diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 2e2a79b1ca53..a93f1c66c395 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -217,6 +217,8 @@ TVM_DLL bool RuntimeEnabled(const std::string& target);
 
 /*! \brief namespace for constant symbols */
 namespace symbol {
+/*! \brief A PackedFunc that retrieves exported metadata. */
+constexpr const char* tvm_get_c_metadata = "get_c_metadata";
 /*! \brief Global variable to store module context. */
 constexpr const char* tvm_module_ctx = "__tvm_module_ctx";
 /*! \brief Global variable to store device module blob */
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index b8ea35e38b6a..98a42d59d648 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -27,9 +27,11 @@
 from tvm.tir import expr as tvm_expr
 from tvm.target import Target
 from .. import nd as _nd, autotvm, register_func
+from ..runtime import load_module
 from ..runtime.executor import aot_executor as _aot_executor
 from ..target import Target
 from ..contrib import graph_executor as _graph_executor
+from ..contrib import utils as contrib_utils
 from . import _build_module
 from . import ty as _ty
 from . import expr as _expr
@@ -662,6 +664,7 @@ def __init__(self, mod, device, target):
         self.mod = mod
         self.device = device
         self.target = target
+        assert target.attrs.get("executor", "graph") == "aot"
 
     def _make_executor(self, expr=None):
         if expr:
@@ -673,7 +676,16 @@ def _make_executor(self, expr=None):
                 "AOT Executor only supports static graphs, got output type", ret_type
             )
         mod = build(self.mod, target=self.target)
-        gmodule = _aot_executor.AotModule(mod["default"](self.device))
+
+        # NOTE: Given AOT requires use of the "c" backend, must export/import to compile the
+        # generated code.
+        temp_so_dir = contrib_utils.TempDirectory()
+        temp_so = temp_so_dir / "temp.so"
+        mod.export_library(temp_so)
+
+        mod = load_module(temp_so)
+        aot_mod = mod["default"](self.device)
+        gmodule = _aot_executor.AotModule(aot_mod)
 
         def _unflatten(flat_iter, cur_type):
             if isinstance(cur_type, _ty.TensorType):
@@ -699,7 +711,7 @@ def _aot_wrapper(*args, **kwargs):
             unflattened = _unflatten(iter(flattened), ret_type)
             return unflattened
 
-        return _graph_wrapper
+        return _aot_wrapper
 
 
 # TODO(mbs): Collapse the create_executor/evaluate phases together since a) most callers don't
@@ -764,5 +776,5 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N
     if kind == "vm":
         return VMExecutor(mod, device, target)
     if kind == "aot":
-        return AotModule(mod)
+        return AotExecutor(mod, device, target)
     raise RuntimeError("unknown execution strategy: {0}".format(kind))
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 636c3caf3159..36975482f684 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -915,12 +915,12 @@ class AOTExecutorCodegen : public MixedModeVisitor {
 
     // Validate choice of use_unpacked_api_ and use_call_cpacked_
     if (runtime_config->name == kTvmRuntimeCrt) {
-      CHECK(interface_api == "packed" || static_cast<bool>(use_unpacked_api_) == true)
-          << "Either need interface_api == \"packed\" (got: " << interface_api
-          << ") or unpacked-api == true (got: " << use_unpacked_api_
-          << ") when targeting c runtime";
+      ICHECK(interface_api == "packed" || static_cast<bool>(use_unpacked_api_) == true)
+        << "Either need interface_api == \"packed\" (got: " << interface_api
+        << ") or unpacked-api == true (got: " << use_unpacked_api_
+        << ") when targeting c runtime";
     } else if (runtime_config->name == kTvmRuntimeCpp) {
-      CHECK(static_cast<bool>(use_unpacked_api_) == false)
+      ICHECK(static_cast<bool>(use_unpacked_api_) == false)
           << "Need unpacked-api == false (got: " << use_unpacked_api_
           << ") and interface-api == \"packed\" (got: " << interface_api
           << ") when targeting c++ runtime";
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 2a591092aaaf..489935fab95f 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -41,6 +41,13 @@ AotExecutor::AotExecutor(tvm::runtime::Module module, const std::vector<Device>&
   auto ret_value = fmetadata();
   metadata_ = ret_value.AsObjectRef<tvm::runtime::metadata::Metadata>();
 
+  ICHECK_EQ(devices_.size(), 1) << "Expect exactly 1 device passed.";
+  DLDevice expected_device{kDLCPU, 0};
+  ICHECK_EQ(devices_[0].device_id, expected_device.device_id)
+    << "At this time, AOTExecutor supports only execution on kDLCPU 0";
+  ICHECK_EQ(devices_[0].device_type, expected_device.device_type)
+    << "At this time, AOTExecutor supports only execution on kDLCPU 0";
+
   for (auto input : metadata_->inputs()) {
     // TODO(areusch): Encode device information in Metadata.
     args_.emplace_back(NDArray::Empty(ShapeTuple(input->shape().begin(), input->shape().end()),
diff --git a/src/runtime/aot_executor/aot_executor.h b/src/runtime/aot_executor/aot_executor.h
index a213ef83bdc2..ccbcf8fdf3d2 100644
--- a/src/runtime/aot_executor/aot_executor.h
+++ b/src/runtime/aot_executor/aot_executor.h
@@ -57,11 +57,8 @@ class TVM_DLL AotExecutor : public ModuleNode {
    * \brief Initialize the AOT executor with metadata, runtime::Module, and device.
    * \param module The module containing the compiled functions for the host
    *  processor.
-   * \param devs The device of the host and devices where graph nodes will be
-   *  executed on.
-   * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters
-   *  by storage_id. If not given, linked parameters are looked-up using an internal implementation,
-   *  which is not compatible with RPCModules. Default is nullptr.
+   * \param devs A 1-element vector. The Device which AOT compute will run on. Currently, only
+   *  Device(kDLCPU, 0) is supported.
    */
   AotExecutor(tvm::runtime::Module module, const std::vector<Device>& devs);
 
diff --git a/src/runtime/metadata.cc b/src/runtime/metadata.cc
index 31144a3140cd..97c6073d1884 100644
--- a/src/runtime/metadata.cc
+++ b/src/runtime/metadata.cc
@@ -65,14 +65,7 @@ TVM_REGISTER_OBJECT_TYPE(TensorInfoNode);
 
 class MetadataModuleNode : public ::tvm::runtime::ModuleNode {
  public:
-  explicit MetadataModuleNode(runtime::metadata::Metadata metadata) {
-    // CHECK((metadata.defined() && code.size() > 0) || (!metadata.defined() && code.size() == 0))
-    //   << "metadata and code must both be either defined (when passed from compiler) or undefined
-    //   "
-    //   << "(when passed from runtime)";
-    metadata_ = metadata;
-    //    code_ = code;
-  }
+  explicit MetadataModuleNode(runtime::metadata::Metadata metadata) : metadata_{::std::move(metadata)} {}
 
   const char* type_key() const { return "metadata_module"; }
 
@@ -87,17 +80,17 @@ class MetadataModuleNode : public ::tvm::runtime::ModuleNode {
       return PackedFunc([this, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
         if (!metadata_.defined()) {
           TVMFunctionHandle f_handle;
-          int32_t ret_code = TVMBackendGetFuncFromEnv(this, "get_c_metadata", &f_handle);
-          CHECK_EQ(ret_code, 0) << "Unable to locate get_c_metadata PackedFunc";
+          int32_t ret_code = TVMBackendGetFuncFromEnv(this, symbol::tvm_get_c_metadata, &f_handle);
+          ICHECK_EQ(ret_code, 0) << "Unable to locate " << symbol::tvm_get_c_metadata << " PackedFunc";
 
           TVMValue ret_value;
           int ret_type_code;
           ret_code = TVMFuncCall(f_handle, nullptr, nullptr, 0, &ret_value, &ret_type_code);
-          CHECK_EQ(ret_code, 0) << "Invoking get_c_metadata: TVMFuncCall returned " << ret_code;
+          ICHECK_EQ(ret_code, 0) << "Invoking " << symbol::tvm_get_c_metadata << ": TVMFuncCall returned " << ret_code;
 
-          CHECK_EQ(ret_type_code, kTVMOpaqueHandle)
+          ICHECK_EQ(ret_type_code, kTVMOpaqueHandle)
               << "Expected kOpaqueHandle returned; got " << ret_type_code;
-          CHECK(ret_value.v_handle != nullptr) << "get_c_metadata returned nullptr";
+          ICHECK(ret_value.v_handle != nullptr) << symbol::tvm_get_c_metadata << " returned nullptr";
 
           metadata_ = runtime::metadata::Metadata(
               static_cast<const struct ::TVMMetadata*>(ret_value.v_handle));
diff --git a/src/target/metadata.h b/src/target/metadata.h
index e77ea0bce71a..b8ca24580f15 100644
--- a/src/target/metadata.h
+++ b/src/target/metadata.h
@@ -105,11 +105,11 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo
                        const ::std::vector<::tvm::runtime::metadata::TensorInfo>& pools,
                        const ::tvm::runtime::String mod_name)
       : VisitableMetadataNode{&storage_},
-        inputs_{new struct TVMTensorInfo[inputs.size()]()},
+        inputs_{new struct TVMTensorInfo[inputs.size()]},
         inputs_objs_{inputs},
-        outputs_{new struct TVMTensorInfo[outputs.size()]()},
+        outputs_{new struct TVMTensorInfo[outputs.size()]},
         outputs_objs_{outputs},
-        pools_{new struct TVMTensorInfo[pools.size()]()},
+        pools_{new struct TVMTensorInfo[pools.size()]},
         pools_objs_{pools},
         mod_name_{mod_name},
         storage_{version, nullptr, 0, nullptr, 0, nullptr, 0, mod_name_.c_str()} {
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index 133886758430..66287f9ad181 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -165,6 +165,7 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt,
  * \param target_module The main TIR-lowered internal runtime module
  * \param modules All the external modules that needs to be imported inside the metadata module(s).
  * \param target The target that all the modules are compiled for
+ * \param metadata Metadata which should be exported to the runtime.
  * \return The wrapped module.
  */
 runtime::Module CreateMetadataModule(
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 461f5d9b6086..78ae982e1906 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -23,6 +23,7 @@
  */
 #include "source_module.h"
 
+#include <tvm/runtime/module.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
@@ -655,7 +656,7 @@ class MetadataSerializer : public AttrVisitor {
   void VisitArray(const runtime::metadata::MetadataArrayNode* array) {
     auto old_is_first_item = is_first_item_;
     is_first_item_ = true;
-    for (unsigned int i = 0; i < array->array.size(); ++i) {  // ObjectRef o : *(array->array)) {
+    for (unsigned int i = 0; i < array->array.size(); ++i) {
       ObjectRef o = array->array[i];
       if (o->IsInstance<IntImmNode>()) {
         int64_t i = Downcast<Integer>(o);
@@ -675,7 +676,6 @@ class MetadataSerializer : public AttrVisitor {
       address_.push_back(i_str.str());
       Visit(nullptr, &metadata);
       address_.pop_back();
-      //      ReflectionVTable::Global()->VisitAttrs(metadata.operator->(), this);
     }
     is_first_item_ = old_is_first_item;
   }
@@ -800,7 +800,7 @@ runtime::Module CreateCSourceCppMetadataModule(runtime::metadata::Metadata metad
               << "extern \"C\"\n"
               << "#endif\n";
 
-  lookup_func << "TVM_DLL int32_t get_c_metadata(TVMValue* arg_values, int* arg_tcodes, int "
+  lookup_func << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_get_c_metadata << "(TVMValue* arg_values, int* arg_tcodes, int "
                  "num_args, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) {"
               << std::endl;
   lookup_func << "    ret_values[0].v_handle = (void*) &" << MetadataSerializer::kGlobalSymbol
@@ -810,8 +810,7 @@ runtime::Module CreateCSourceCppMetadataModule(runtime::metadata::Metadata metad
   lookup_func << "};" << std::endl;
 
   auto mod = MetadataModuleCreate(metadata);
-  std::vector<String> func_names{"get_c_metadata"};
-  // definer.GetOutput() +
+  std::vector<String> func_names{::tvm::runtime::symbol::tvm_get_c_metadata};
   auto c = CSourceModuleCreate(serializer.GetOutput() + lookup_func.str(), "c", func_names,
                                Array<String>());
   mod->Import(c);
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index 73d8b2182953..1f88fe0f8437 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -160,10 +160,31 @@ def test_create_executor():
     x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
     expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
     actual = relay.create_executor(
-        "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr))
+        "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c -executor=aot"
     ).evaluate()(np.array([2], dtype="float32"))
 
-    assert np.array([3.], dtype="float32") == actual
+    np.isfinite(np.array([3], dtype="float32"))
+
+    np.testing.assert_allclose(actual.numpy(), np.array([3], dtype="float32"))
+
+
+def test_pass_wrong_device_arg():
+    x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
+    expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        mod = tvm.relay.build(tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c",
+                              executor=backend.Executor("aot", {"interface-api": "packed"}))
+
+    temp_dir = tvm.contrib.utils.TempDirectory()
+    test_so_path = temp_dir / "test.so"
+    mod.export_library(test_so_path, cc="gcc", options=["-std=c11"])
+    loaded_mod = tvm.runtime.load_module(test_so_path)
+
+    with pytest.raises(tvm.TVMError) as cm:
+        tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0), tvm.cpu(0)))
+
+        assert "Check failed: devices_.size() == 1 (2 vs. 1) : Expect exactly 1 device passed." in str(cm.exception)
+    # TODO write asserts for # and type of device.
 
 
 if __name__ == "__main__":

From f32ba4b4e7989e3b2b2b888736e6d3edda46e953 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 2 Mar 2022 10:22:33 -0800
Subject: [PATCH 19/21] clang and python format

---
 python/tvm/relay/build_module.py          |  4 +---
 src/relay/backend/aot_executor_codegen.cc |  6 +++---
 src/runtime/aot_executor/aot_executor.cc  |  4 ++--
 src/runtime/metadata.cc                   | 12 ++++++++----
 src/target/source/source_module.cc        |  3 ++-
 tests/python/relay/aot/test_cpp_aot.py    | 12 +++++++++---
 tests/python/relay/test_vm.py             | 11 +++++++++--
 7 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 98a42d59d648..ba7cbd81f1bd 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -672,9 +672,7 @@ def _make_executor(self, expr=None):
         self.mod = InferType()(self.mod)
         ret_type = self.mod["main"].checked_type.ret_type
         if _ty.is_dynamic(ret_type):
-            raise ValueError(
-                "AOT Executor only supports static graphs, got output type", ret_type
-            )
+            raise ValueError("AOT Executor only supports static graphs, got output type", ret_type)
         mod = build(self.mod, target=self.target)
 
         # NOTE: Given AOT requires use of the "c" backend, must export/import to compile the
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 36975482f684..bf4c1d73ae38 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -916,9 +916,9 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     // Validate choice of use_unpacked_api_ and use_call_cpacked_
     if (runtime_config->name == kTvmRuntimeCrt) {
       ICHECK(interface_api == "packed" || static_cast<bool>(use_unpacked_api_) == true)
-        << "Either need interface_api == \"packed\" (got: " << interface_api
-        << ") or unpacked-api == true (got: " << use_unpacked_api_
-        << ") when targeting c runtime";
+          << "Either need interface_api == \"packed\" (got: " << interface_api
+          << ") or unpacked-api == true (got: " << use_unpacked_api_
+          << ") when targeting c runtime";
     } else if (runtime_config->name == kTvmRuntimeCpp) {
       ICHECK(static_cast<bool>(use_unpacked_api_) == false)
           << "Need unpacked-api == false (got: " << use_unpacked_api_
diff --git a/src/runtime/aot_executor/aot_executor.cc b/src/runtime/aot_executor/aot_executor.cc
index 489935fab95f..5bd0d80ff95c 100644
--- a/src/runtime/aot_executor/aot_executor.cc
+++ b/src/runtime/aot_executor/aot_executor.cc
@@ -44,9 +44,9 @@ AotExecutor::AotExecutor(tvm::runtime::Module module, const std::vector<Device>&
   ICHECK_EQ(devices_.size(), 1) << "Expect exactly 1 device passed.";
   DLDevice expected_device{kDLCPU, 0};
   ICHECK_EQ(devices_[0].device_id, expected_device.device_id)
-    << "At this time, AOTExecutor supports only execution on kDLCPU 0";
+      << "At this time, AOTExecutor supports only execution on kDLCPU 0";
   ICHECK_EQ(devices_[0].device_type, expected_device.device_type)
-    << "At this time, AOTExecutor supports only execution on kDLCPU 0";
+      << "At this time, AOTExecutor supports only execution on kDLCPU 0";
 
   for (auto input : metadata_->inputs()) {
     // TODO(areusch): Encode device information in Metadata.
diff --git a/src/runtime/metadata.cc b/src/runtime/metadata.cc
index 97c6073d1884..90469fabad2c 100644
--- a/src/runtime/metadata.cc
+++ b/src/runtime/metadata.cc
@@ -65,7 +65,8 @@ TVM_REGISTER_OBJECT_TYPE(TensorInfoNode);
 
 class MetadataModuleNode : public ::tvm::runtime::ModuleNode {
  public:
-  explicit MetadataModuleNode(runtime::metadata::Metadata metadata) : metadata_{::std::move(metadata)} {}
+  explicit MetadataModuleNode(runtime::metadata::Metadata metadata)
+      : metadata_{::std::move(metadata)} {}
 
   const char* type_key() const { return "metadata_module"; }
 
@@ -81,16 +82,19 @@ class MetadataModuleNode : public ::tvm::runtime::ModuleNode {
         if (!metadata_.defined()) {
           TVMFunctionHandle f_handle;
           int32_t ret_code = TVMBackendGetFuncFromEnv(this, symbol::tvm_get_c_metadata, &f_handle);
-          ICHECK_EQ(ret_code, 0) << "Unable to locate " << symbol::tvm_get_c_metadata << " PackedFunc";
+          ICHECK_EQ(ret_code, 0) << "Unable to locate " << symbol::tvm_get_c_metadata
+                                 << " PackedFunc";
 
           TVMValue ret_value;
           int ret_type_code;
           ret_code = TVMFuncCall(f_handle, nullptr, nullptr, 0, &ret_value, &ret_type_code);
-          ICHECK_EQ(ret_code, 0) << "Invoking " << symbol::tvm_get_c_metadata << ": TVMFuncCall returned " << ret_code;
+          ICHECK_EQ(ret_code, 0) << "Invoking " << symbol::tvm_get_c_metadata
+                                 << ": TVMFuncCall returned " << ret_code;
 
           ICHECK_EQ(ret_type_code, kTVMOpaqueHandle)
               << "Expected kOpaqueHandle returned; got " << ret_type_code;
-          ICHECK(ret_value.v_handle != nullptr) << symbol::tvm_get_c_metadata << " returned nullptr";
+          ICHECK(ret_value.v_handle != nullptr)
+              << symbol::tvm_get_c_metadata << " returned nullptr";
 
           metadata_ = runtime::metadata::Metadata(
               static_cast<const struct ::TVMMetadata*>(ret_value.v_handle));
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 78ae982e1906..7db5d8c83a84 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -800,7 +800,8 @@ runtime::Module CreateCSourceCppMetadataModule(runtime::metadata::Metadata metad
               << "extern \"C\"\n"
               << "#endif\n";
 
-  lookup_func << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_get_c_metadata << "(TVMValue* arg_values, int* arg_tcodes, int "
+  lookup_func << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_get_c_metadata
+              << "(TVMValue* arg_values, int* arg_tcodes, int "
                  "num_args, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) {"
               << std::endl;
   lookup_func << "    ret_values[0].v_handle = (void*) &" << MetadataSerializer::kGlobalSymbol
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index 1f88fe0f8437..48057404dd4c 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -172,8 +172,11 @@ def test_pass_wrong_device_arg():
     x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
     expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
-        mod = tvm.relay.build(tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c",
-                              executor=backend.Executor("aot", {"interface-api": "packed"}))
+        mod = tvm.relay.build(
+            tvm.IRModule.from_expr(tvm.relay.Function([x], expr)),
+            target="c",
+            executor=backend.Executor("aot", {"interface-api": "packed"}),
+        )
 
     temp_dir = tvm.contrib.utils.TempDirectory()
     test_so_path = temp_dir / "test.so"
@@ -183,7 +186,10 @@ def test_pass_wrong_device_arg():
     with pytest.raises(tvm.TVMError) as cm:
         tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0), tvm.cpu(0)))
 
-        assert "Check failed: devices_.size() == 1 (2 vs. 1) : Expect exactly 1 device passed." in str(cm.exception)
+        assert (
+            "Check failed: devices_.size() == 1 (2 vs. 1) : Expect exactly 1 device passed."
+            in str(cm.exception)
+        )
     # TODO write asserts for # and type of device.
 
 
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index ebfec0fa23a0..e4666c63c8c5 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -922,12 +922,14 @@ def test_get_input_index(target, dev):
     assert vm_factory.get_input_index(data_0) == 0
     assert vm_factory.get_input_index("invalid") == -1
 
+
 def get_one_input_relay_mod(tensor_type, shape, data_name):
-    x = relay.var(data_name, shape = shape, dtype = tensor_type)
+    x = relay.var(data_name, shape=shape, dtype=tensor_type)
     y = relay.exp(x)
     f = relay.Function([x], y)
     return IRModule.from_expr(f)
 
+
 @tvm.testing.parametrize_targets("llvm")
 def test_one_set_input(target, dev):
     dtype = "float32"
@@ -956,11 +958,13 @@ def test_one_set_input(target, dev):
     assert output.dtype == ref_res.dtype
     tvm.testing.assert_allclose(ref_res_core, output.numpy())
 
+
 def get_multiple_input_relay_mod(tensor_type, shape, data_name0, data_name1):
-    x, y = [relay.var(c, shape=shape, dtype = tensor_type) for c in [data_name0, data_name1]]
+    x, y = [relay.var(c, shape=shape, dtype=tensor_type) for c in [data_name0, data_name1]]
     f = relay.Function([x, y], x + y)
     return IRModule.from_expr(f)
 
+
 @tvm.testing.parametrize_targets("llvm")
 def test_multiple_set_input(target, dev):
     dtype = "float32"
@@ -992,6 +996,7 @@ def test_multiple_set_input(target, dev):
     assert output.dtype == ref_res.dtype
     tvm.testing.assert_allclose(ref_res_core, output.numpy())
 
+
 @tvm.testing.parametrize_targets("llvm")
 def test_one_set_one_input(target, dev):
     dtype = "float32"
@@ -1025,6 +1030,7 @@ def test_one_set_one_input(target, dev):
     assert output.dtype == ref_res.dtype
     tvm.testing.assert_allclose(ref_res_core, output.numpy())
 
+
 @tvm.testing.parametrize_targets("llvm")
 def test_multiple_set_one_input(target, dev):
     dtype = "float32"
@@ -1065,6 +1071,7 @@ def test_multiple_set_one_input(target, dev):
     assert output.dtype == ref_res.dtype
     tvm.testing.assert_allclose(ref_res_core, output.numpy())
 
+
 @tvm.testing.parametrize_targets("llvm")
 def test_benchmark(target, dev):
     mod, params = mlp.get_workload(1)

From 4f658a6b389e5286601662480e6bf063a46dd904 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 2 Mar 2022 16:21:05 -0800
Subject: [PATCH 20/21] Fix broken test

---
 python/tvm/relay/build_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index ba7cbd81f1bd..7872091f1a5d 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -679,7 +679,7 @@ def _make_executor(self, expr=None):
         # generated code.
         temp_so_dir = contrib_utils.TempDirectory()
         temp_so = temp_so_dir / "temp.so"
-        mod.export_library(temp_so)
+        mod.export_library(temp_so, cc="gcc", options=["-std=c11"])
 
         mod = load_module(temp_so)
         aot_mod = mod["default"](self.device)

From 2d42f22f143435dc1eb6172628466e1b2ed548ba Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 2 Mar 2022 16:21:12 -0800
Subject: [PATCH 21/21] Address lingering comments from masahi, kparszyzc

---
 src/relay/backend/aot_executor_codegen.cc | 6 +-----
 src/target/metadata_module.cc             | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index bf4c1d73ae38..6e6c4b6dfeb2 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -180,11 +180,7 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor {
         return_ids_.push_back(sid);
       }
       return_ttypes_.clear();
-      auto ttypes = FlattenTupleType(e->checked_type());
-      return_ttypes_.reserve(ttypes.size());
-      for (auto ttype : ttypes) {
-        return_ttypes_.push_back(ttype);
-      }
+      return_ttypes_ = FlattenTupleType(e->checked_type());
     }
   }
   /*!
diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
index ca55b46805d6..8abd18c1d8f3 100644
--- a/src/target/metadata_module.cc
+++ b/src/target/metadata_module.cc
@@ -80,10 +80,10 @@ static runtime::Module CreateCrtMetadataModule(
 // TODO(areusch,masahi): Unify metadata representation and remove the need for this function
 static runtime::metadata::Metadata ConvertMetaData(
     relay::backend::ExecutorCodegenMetadata metadata) {
+  ICHECK(metadata.defined());
   ICHECK_NOTNULL(metadata->pool_inputs);
 
   std::vector<runtime::metadata::TensorInfo> inputs;
-  ICHECK(metadata.defined());
   for (size_t i = 0; i < metadata->inputs.size(); ++i) {
     auto v = metadata->inputs[i];
     auto ttype = metadata->input_tensor_types[i];