From d0842df3a96ef4c340bd78a6fc9fc6bc886e29a4 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom
Date: Fri, 7 May 2021 20:29:00 +0000
Subject: [PATCH 1/9] [AOT] Initial implementation of --no-typed-operators

Based on the discussions in the AOT embedded improvements RFC, this adds a
flag to the target which changes the internal operators to an unpacked API.
The unpacked API passes each input buffer as a separate argument to the
operator function, for example:

int32_t operator(void* arg0, void* arg1);

As opposed to the traditional packed API:

int32_t operator(void** args);

The entrypoint function is unaffected and retains a packed API for
compatibility with other parts of TVM. This is achieved by changing the
passes applied to non-entrypoint (i.e. not CallingConv::kEntryPoint)
functions.
---
 include/tvm/ir/function.h | 7 +
 include/tvm/tir/transform.h | 11 ++
 python/tvm/tir/transform/transform.py | 11 ++
 src/driver/driver_api.cc | 65 ++++++--
 src/relay/backend/aot_executor_codegen.cc | 41 +++--
 src/target/target_kind.cc | 1 +
 src/tir/transforms/make_packed_api.cc | 5 +-
 src/tir/transforms/make_unpacked_api.cc | 131 +++++++++++++++
 tests/python/relay/aot/aot_test_utils.py | 10 +-
 tests/python/relay/aot/test_crt_aot.py | 74 ++++++---
 .../test_tir_transform_make_unpacked_api.py | 149 ++++++++++++++++++
 11 files changed, 450 insertions(+), 55 deletions(-)
 create mode 100644 src/tir/transforms/make_unpacked_api.cc
 create mode 100644 tests/python/unittest/test_tir_transform_make_unpacked_api.py

diff --git a/include/tvm/ir/function.h b/include/tvm/ir/function.h
index 5b9e0714e202..933ea4af6fe8 100644
--- a/include/tvm/ir/function.h
+++ b/include/tvm/ir/function.h
@@ -61,6 +61,13 @@ enum class CallingConv : int {
    * - Implementation: defined by device runtime(e.g. runtime/cuda)
    */
   kDeviceKernelLaunch = 2,
+  /*!
+   * \brief Function that represents the entrypoint to a TVM network
+   *
+   * - This is transformed to either a packed function or a micro entrypoint
+   * - Implementation: Change the passes over this function
+   */
+  kEntryPoint = 3
 };

 /*!
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index b1957d2a9cb8..2113d58f1ffa 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -212,6 +212,17 @@ TVM_DLL Pass InstrumentBoundCheckers();
  */
 TVM_DLL Pass MakePackedAPI(int num_unpacked_args);

+/*!
+ * \brief Transform the high-level PrimFunc to a C signature that can be used
+ * to call the operator directly.
+ *
+ * The main task of this function is to create code that maps the values in the
+ * api_args to Var that is required by body
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass MakeUnpackedAPI();
+
 /*!
  * \brief Remap the thread axis
  *
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index be55b48da71e..26b22f99c215 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -347,6 +347,17 @@ def MakePackedAPI(num_unpacked_params=0):
     return _ffi_api.MakePackedAPI(num_unpacked_params)


+def MakeUnpackedAPI():
+    """Transform the PrimFuncs in the module to a C API compatible with internal calls.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.MakeUnpackedAPI()
+
+
 def SplitHostDevice():
     """Split the function into a host function and device functions.
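As a side note on the two calling conventions described in the commit message above, the difference is easiest to see from the caller's side. The following is a minimal, self-contained C sketch, not generated code: the operator name, its trivial body, and the bare void** packed form all mirror the simplified signatures in the commit message and are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical operator with the unpacked API: each buffer is its own argument. */
static int32_t add_op_unpacked(void* arg0, void* arg1, void* out0) {
  const float* a = (const float*)arg0;
  const float* b = (const float*)arg1;
  float* c = (float*)out0;
  for (int i = 0; i < 4; ++i) c[i] = a[i] + b[i];
  return 0;
}

/* The same operator behind a packed-style API: buffers arrive via one argument table. */
static int32_t add_op_packed(void** args) {
  return add_op_unpacked(args[0], args[1], args[2]);
}

int main(void) {
  float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {0};
  void* args[3] = {a, b, c};
  add_op_packed(args);       /* packed: caller builds an argument table */
  add_op_unpacked(a, b, c);  /* unpacked: buffers spread across parameters */
  printf("c[0] = %f\n", (double)c[0]);
  return 0;
}

The unpacked form avoids the argument-table indirection, which is what makes it attractive for the embedded AOT use case targeted by this series.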
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index f30cecbf7f05..2a07ed26a341 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -128,6 +128,22 @@ transform::Pass Filter(FCond fcond) { return tir::transform::CreatePrimFuncPass(fpass, 0, "Filter", {}); } +transform::Pass FilterCallingConv(CallingConv calling_conv, bool should_match) { + return Filter([calling_conv, should_match](const tir::PrimFunc& f) { + auto actual_conv = f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)); + bool does_match = actual_conv == calling_conv; + return does_match == should_match; + }); +} + +transform::Pass FilterCallingConv(CallingConv calling_conv) { + return FilterCallingConv(calling_conv, true); +} + +transform::Pass FilterNotCallingConv(CallingConv calling_conv) { + return FilterCallingConv(calling_conv, false); +} + IRModule lower(te::Schedule sch, const Array& args, const std::string& name, const std::unordered_map& binds) { Array out_arg_list; @@ -185,12 +201,10 @@ IRModule lower(te::Schedule sch, const Array& args, const std::strin return mod; } -std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, - const Target& target_host_arg, - const transform::PassContext& pass_ctx) { - Target target = target_arg, target_host = target_host_arg; - CheckAndUpdateHostConsistency(&target, &target_host); - Array mixed_pass_list = {BindTarget(target), +IRModule MixedPasses(IRModule mod_mixed, const Target& target, + const transform::PassContext& pass_ctx, const tvm::transform::Pass& filter, + bool use_unpacked_api) { + Array mixed_pass_list = {filter, BindTarget(target), tir::transform::VerifyMemory()}; if (pass_ctx->GetConfig("tir.detect_global_barrier", Bool(false)).value()) { @@ -200,16 +214,38 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target mixed_pass_list.push_back(tir::transform::ThreadSync("warp")); mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); - mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); + + if (use_unpacked_api) { + mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); + } else { + mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); + } + mixed_pass_list.push_back(tir::transform::SplitHostDevice()); + auto opt_mixed = transform::Sequential(mixed_pass_list); - mod_mixed = opt_mixed(std::move(mod_mixed)); + return opt_mixed(mod_mixed); +} + +std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, + const Target& target_host_arg, + const transform::PassContext& pass_ctx) { + Target target = target_arg, target_host = target_host_arg; + CheckAndUpdateHostConsistency(&target, &target_host); + + // Run default passes over entrypoint function + auto entrypoint_filter = FilterCallingConv(CallingConv::kEntryPoint); + auto entrypoint_mod = MixedPasses(mod_mixed, target, pass_ctx, entrypoint_filter, false); + + // Create passes for untyped operators but maintain default API for entrypoint + auto untyped_operators = target->GetAttr("no-typed-operators").value_or(Bool(false)); + auto operator_filter = FilterNotCallingConv(CallingConv::kEntryPoint); + mod_mixed = MixedPasses(mod_mixed, target, pass_ctx, operator_filter, untyped_operators); + + mod_mixed->Update(entrypoint_mod); auto host_pass_list = { - Filter([](const tir::PrimFunc& f) { - return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) != - CallingConv::kDeviceKernelLaunch; - }), + 
FilterNotCallingConv(CallingConv::kDeviceKernelLaunch), BindTarget(target_host), tir::transform::LowerTVMBuiltin(), tir::transform::LowerCustomDatatypes(), @@ -223,10 +259,7 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target // device pipeline auto device_pass_list = { - Filter([](const tir::PrimFunc& f) { - return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == - CallingConv::kDeviceKernelLaunch; - }), + FilterCallingConv(CallingConv::kDeviceKernelLaunch), BindTarget(target), tir::transform::LowerWarpMemory(), tir::transform::Simplify(), diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a005247d424a..0d438765878f 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -192,29 +192,51 @@ class AOTExecutorCodegen : public ExprVisitor { } } + /*! + * \brief Unpacks a buffer if operators are using the unpacked C-style interface + */ + PrimExpr UnpackBufferIfUnpackedSignature(tir::Var arg) { + auto untyped_operators = + target_host_->GetAttr("no-typed-operators").value_or(Bool(false)); + if (!untyped_operators) { + return arg; + } + + return tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {arg, 0, tir::builtin::kArrData}); + } + /*! * brief Call a function with a given name */ void CreateFuncCall(Call call, std::string func_name) { + auto untyped_operators = + target_host_->GetAttr("no-typed-operators").value_or(Bool(false)); + tvm::Array args{tvm::tir::StringImm(func_name)}; std::vector create_func_call_stmts; // Pack the inputs for (Expr arg : call->args) { auto var_arg = FindExpr(arg); - args.push_back(var_arg[0]); + args.push_back(UnpackBufferIfUnpackedSignature(var_arg[0])); } auto ret_expr = Downcast(call); - // Pack the return(s) value. 
A call node can produce multiple outputs for (const auto& var : PackSid(ret_expr)) { - args.push_back(var); + args.push_back(UnpackBufferIfUnpackedSignature(var)); } - // Use tvm_call_packed to execute the function - create_func_call_stmts.push_back(tir::Evaluate( - tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args))); + // Use tvm_call_packed to execute the function unless we're calling directly + auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked(); + if (untyped_operators) { + calling_pattern = tvm::tir::builtin::call_extern(); + } + + create_func_call_stmts.push_back( + tir::Evaluate(tvm::tir::Call(DataType::Int(32), calling_pattern, args))); + tir::Stmt body = tir::SeqStmt(create_func_call_stmts); stmts_.push_back(body); } @@ -518,6 +540,7 @@ class AOTExecutorCodegen : public ExprVisitor { // Define the PrimFunc attributes Map dict_attrs; dict_attrs.Set("global_symbol", runtime::String(runtime::symbol::tvm_run_func_prefix)); + dict_attrs.Set(tvm::attr::kCallingConv, Integer(CallingConv::kEntryPoint)); // Make the PrimFunc return tir::PrimFunc(main_signature_, body, VoidType(), Map(), @@ -575,10 +598,9 @@ class AOTExecutorCodegen : public ExprVisitor { auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); storage_device_map_ = (*pf)(func); - int input_index = 0; for (auto input : func->params) { input_vars_.push_back(input); - main_signature_.push_back(tir::Var(MakeString("input_", input_index), DataType::Handle())); + main_signature_.push_back(tir::Var("input", DataType::Handle())); } // Define the storage allocator ids @@ -592,7 +614,8 @@ class AOTExecutorCodegen : public ExprVisitor { // Find the return sid return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { - main_signature_.push_back(tir::Var(MakeString("output_", output_index), DataType::Handle())); + auto output_var = tir::Var("output", DataType::Handle()); + main_signature_.push_back(output_var); } VisitExpr(func->body); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 08e998e0f035..355bc4e3b4ff 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -268,6 +268,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("march") .add_attr_option("executor") .add_attr_option("workspace-byte-alignment") + .add_attr_option("no-typed-operators", Bool(false)) .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 0cc0086897d8..793db05ad609 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -290,8 +290,9 @@ Pass MakePackedAPI(int num_unpacked_args) { for (const auto& kv : mptr->functions) { if (auto* n = kv.second.as()) { PrimFunc func = GetRef(n); - if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == - CallingConv::kDefault) { + auto calling_conv = + func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)); + if (calling_conv == CallingConv::kDefault || calling_conv == CallingConv::kEntryPoint) { auto updated_func = MakePackedAPI(std::move(func), num_unpacked_args); updates.push_back({kv.first, updated_func}); } diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc new file mode 100644 index 000000000000..00133cd5e1ab --- /dev/null +++ b/src/tir/transforms/make_unpacked_api.cc @@ -0,0 +1,131 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file make_unpacked_api.cc Lower PrimFunc to a standard C function API. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "arg_binder.h" +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +PrimFunc MakeUnpackedAPI(PrimFunc&& func) { + auto global_symbol = func->GetAttr(tvm::attr::kGlobalSymbol); + ICHECK(global_symbol) << "MakeUnpackedAPI: Expect PrimFunc to have the global_symbol attribute"; + + auto target = func->GetAttr(tvm::attr::kTarget); + ICHECK(target.defined()) << "MakeUnpackedAPI: Require the target attribute"; + + auto* func_ptr = func.CopyOnWrite(); + + // Setup device context + int target_device_type = target.value()->kind->device_type; + Integer device_type(target_device_type); + Integer device_id(0); + PrimExpr node = StringImm("default"); + const Stmt nop = Evaluate(0); + std::vector device_init = {AttrStmt(node, attr::device_id, device_id, nop), + AttrStmt(node, attr::device_type, device_type, nop)}; + + // Create arg to buffer binder + std::unordered_map vmap; + ArgBinder binder(&vmap); + + // Collect variables and buffers to map between + Array args; + std::vector> var_def; + std::vector> buffer_def; + + for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { + Var param = func_ptr->params[i]; + Var v_arg = Var("arg" + std::to_string(i), param->dtype); + + auto it = func_ptr->buffer_map.find(param); + if (it != func_ptr->buffer_map.end()) { + buffer_def.emplace_back(v_arg, (*it).second); + } else { + var_def.emplace_back(v_arg, param); + } + + args.push_back(v_arg); + } + + // Bind variables then bind buffers to them to ensure correct ordering + for (const auto& kv : var_def) { + binder.Bind(kv.second, kv.first, kv.first->name_hint, true); + } + for (const auto& kv : buffer_def) { + binder.Bind(kv.second->data, kv.first, kv.first->name_hint, true); + } + + func_ptr->body = MergeNest({device_init, binder.init_nest(), binder.asserts()}, func_ptr->body); + func_ptr->params = args; + func_ptr->ret_type = PrimType(DataType::Int(32)); + + // return the function. 
+ return std::move(func); +} + +namespace transform { + +Pass MakeUnpackedAPI() { + auto pass_func = [](IRModule m, PassContext ctx) { + IRModuleNode* mptr = m.CopyOnWrite(); + std::vector> updates; + + for (const auto& kv : mptr->functions) { + if (auto* n = kv.second.as()) { + PrimFunc func = GetRef(n); + if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == + CallingConv::kDefault) { + auto updated_func = MakeUnpackedAPI(std::move(func)); + updates.push_back({kv.first, updated_func}); + } + } + } + + for (const auto& pair : updates) { + mptr->AddUnchecked(pair.first, pair.second); + } + return m; + }; + + return tvm::transform::CreateModulePass(pass_func, 0, "tir.MakeUnpackedAPI", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.MakeUnpackedAPI").set_body_typed(MakeUnpackedAPI); +} // namespace transform +} // namespace tir +} // namespace tvm diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index c1917674873d..a54ffb80f051 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -165,12 +165,18 @@ def extract_main_workspace_sizebytes(extract_dir): def compile_and_run( - mod, input_list, output_list, use_calculated_workspaces, params=None, workspace_byte_alignment=8 + mod, + input_list, + output_list, + target_options, + use_calculated_workspaces, + params=None, + workspace_byte_alignment=8, ): """ This method verifies the generated source """ - target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}" + target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment} {target_options}" cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " # The calculated workspaces will not account for stack allocator tags used for debugging diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 02b4de3a64f3..7638428eafcf 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -44,7 +44,8 @@ @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_conv_with_params(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_conv_with_params(use_calculated_workspaces, target_options): RELAY_MODEL = """ #[version = "0.0.5"] def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) { @@ -73,11 +74,12 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), output_list = generate_ref_data(mod, inputs, params) input_list = [input_data] - compile_and_run(mod, input_list, output_list, use_calculated_workspaces, params) + compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces, params) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_add_with_params(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_add_with_params(use_calculated_workspaces, target_options): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) z = relay.add(x, y) @@ -91,11 +93,14 @@ def test_add_with_params(use_calculated_workspaces): output_list = generate_ref_data(func, inputs, params) input_list = [y_in] - compile_and_run(func, input_list, output_list, use_calculated_workspaces, params) + compile_and_run( + func, input_list, output_list, target_options, 
use_calculated_workspaces, params + ) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_conv2d(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_conv2d(use_calculated_workspaces, target_options): """Test a subgraph with a single conv2d operator.""" def conv2d_direct(): @@ -137,11 +142,12 @@ def group_conv2d(): for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: output_list = generate_ref_data(mod, inputs) input_list = [inputs["data"], inputs["weight"]] - compile_and_run(mod, input_list, output_list, use_calculated_workspaces) + compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_concatenate(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_concatenate(use_calculated_workspaces, target_options): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) @@ -157,11 +163,12 @@ def test_concatenate(use_calculated_workspaces): output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"], inputs["z"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_nested_tuples(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_nested_tuples(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) x2 = x1 + relay.const(1.0) @@ -174,39 +181,43 @@ def test_nested_tuples(use_calculated_workspaces): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [x_data] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_tuple_getitem(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_tuple_getitem(use_calculated_workspaces, target_options): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_id(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_id(use_calculated_workspaces, target_options): x = relay.var("x", "float32") ident = relay.Function([x], x) one = np.array(1.0, "float32") inputs = {"x": one} output_list = generate_ref_data(ident, inputs) input_list = [one] - compile_and_run(ident, input_list, output_list, use_calculated_workspaces) + compile_and_run(ident, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_add_const(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_add_const(use_calculated_workspaces, target_options): two = 
relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_mul_param(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_mul_param(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) func = relay.Function([x, y], relay.multiply(x, y)) @@ -215,11 +226,12 @@ def test_mul_param(use_calculated_workspaces): inputs = {"x": x_data, "y": y_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_subtract(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_subtract(use_calculated_workspaces, target_options): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32")) @@ -227,11 +239,12 @@ def test_subtract(use_calculated_workspaces): inputs = {"i": i_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["i"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_tuple_output(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +def test_tuple_output(use_calculated_workspaces, target_options): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() a = relay.TupleGetItem(y, 0) @@ -243,15 +256,17 @@ def test_tuple_output(use_calculated_workspaces): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize( "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] ) -def test_mobilenet(use_calculated_workspaces_and_alignment): +@pytest.mark.parametrize("target_options", ["--no-typed-operators"]) +def test_mobilenet(use_calculated_workspaces_and_alignment, target_options): use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] + mod, params = testing.mobilenet.get_workload(batch_size=1) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype("float32") @@ -259,7 +274,13 @@ def test_mobilenet(use_calculated_workspaces_and_alignment): output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["data"]] compile_and_run( - mod, input_list, output_list, use_calculated_workspaces, params, workspace_byte_alignment + mod, + input_list, + output_list, + target_options, + use_calculated_workspaces, + params, + workspace_byte_alignment, ) @@ -318,7 +339,8 @@ 
def visit_call(self, call): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_byoc_utvm(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", [""]) +def test_byoc_utvm(use_calculated_workspaces, target_options): """This is a simple test case to check BYOC capabilities of AOT""" x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -361,7 +383,7 @@ def test_byoc_utvm(use_calculated_workspaces): output_list = generate_ref_data(mod, map_inputs) input_list = [map_inputs["x"]] input_list.extend([map_inputs["w{}".format(i)] for i in range(8)]) - compile_and_run(mod, input_list, output_list, use_calculated_workspaces) + compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_transform_make_unpacked_api.py b/tests/python/unittest/test_tir_transform_make_unpacked_api.py new file mode 100644 index 000000000000..92526652d4c3 --- /dev/null +++ b/tests/python/unittest/test_tir_transform_make_unpacked_api.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest + +import tvm +from tvm import te +import numpy + + +@pytest.fixture +def mod_without_attrs(): + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) + A = ib.pointer("float32", name="A") + stmt = ib.get() + return tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], stmt)) + + +@pytest.fixture +def mod(mod_without_attrs): + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))( + mod_without_attrs + ) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + + return mod + + +def test_fails_if_not_global_symbol(mod_without_attrs): + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))( + mod_without_attrs + ) + with pytest.raises(tvm.TVMError, match="Expect PrimFunc to have the global_symbol attribute"): + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + + +def test_fails_if_no_target(mod_without_attrs): + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod_without_attrs) + with pytest.raises(tvm.TVMError, match="Require the target attribute"): + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + + +@tvm.testing.parametrize_targets("c", "llvm", "cuda") +def test_device_setup(mod, target, dev): + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target(target)))(mod) + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 1 + assert f.params[0].name == "arg0" + assert f.body.node == "default" + assert f.body.attr_key == "device_id" + assert f.body.value == 0 + assert f.body.body.node == "default" + assert f.body.body.attr_key == "device_type" + assert f.body.body.value == dev.device_type + + +def test_argument_mapping(mod): + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 1 + assert f.params[0].name == "arg0" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg0" + + +def test_argument_mapping_multiple(): + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) + A = ib.pointer("float32", name="A") + B = ib.pointer("float32", name="B") + stmt = ib.get() + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B], stmt)) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 2 + assert f.params[0].name == "arg0" + assert f.params[1].name == "arg1" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg0" + assert f.body.body.body.body.var.name == "B" + assert f.body.body.body.body.value.name == "arg1" + + +def test_argument_mapping_multiple_matching(): + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) + A = ib.pointer("float32", name="A") + B = ib.pointer("float32", name="B") + stmt = ib.get() + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, A], stmt)) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 2 + assert f.params[0].name == "arg0" + assert f.params[1].name == "arg1" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg0" + assert f.body.body.body.body.condition.a.name == "A" + assert 
f.body.body.body.body.condition.b.name == "arg1" + + +def test_body(mod): + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) + A = ib.pointer("float32", name="A") + B = ib.pointer("float32", name="B") + C = ib.pointer("float32", name="C") + C[0] = A[0] + B[0] + stmt = ib.get() + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B, C], stmt)) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 3 + assert f.params[0].name == "arg0" + assert f.params[1].name == "arg1" + assert f.params[2].name == "arg2" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg0" + assert f.body.body.body.body.var.name == "B" + assert f.body.body.body.body.value.name == "arg1" + assert f.body.body.body.body.body.var.name == "C" + assert f.body.body.body.body.body.value.name == "arg2" + assert f.body.body.body.body.body.body.buffer_var.name == "C" + assert f.body.body.body.body.body.body.value.a.buffer_var.name == "A" + assert f.body.body.body.body.body.body.value.a.index == 0 + assert f.body.body.body.body.body.body.value.b.buffer_var.name == "B" + assert f.body.body.body.body.body.body.value.b.index == 0 + + +if __name__ == "__main__": + pytest.main([__file__]) From 1f03e8f5d417fa34bde36513ede2a6656b6ff4ea Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 11 May 2021 09:27:32 +0000 Subject: [PATCH 2/9] Move entrypoint generation outside of main passes This removes the logic for deciding the entrypoint from the compiler passes and instead moves it into the metadata code generation. By moving the generation, we can generate a variety of entrypoints on top of the compiler output (such as the micro entrypoint discussed in the RFC). --- include/tvm/ir/function.h | 7 -- src/driver/driver_api.cc | 58 +++++---------- src/relay/backend/aot_executor_codegen.cc | 70 ++++++++----------- src/target/source/source_module.cc | 50 +++++++++++-- src/tir/transforms/make_packed_api.cc | 5 +- src/tir/transforms/make_unpacked_api.cc | 8 ++- .../test_tir_transform_make_unpacked_api.py | 13 ++++ 7 files changed, 114 insertions(+), 97 deletions(-) diff --git a/include/tvm/ir/function.h b/include/tvm/ir/function.h index 933ea4af6fe8..5b9e0714e202 100644 --- a/include/tvm/ir/function.h +++ b/include/tvm/ir/function.h @@ -61,13 +61,6 @@ enum class CallingConv : int { * - Implementation: defined by device runtime(e.g. runtime/cuda) */ kDeviceKernelLaunch = 2, - /*! - * \brief Function that represents the entrypoint to a TVM network - * - * - This is transformed to either a packed function or a micro entrypoint - * - Implementation: Change the passes over this function - */ - kEntryPoint = 3 }; /*! 
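To make the adapter described in this commit message concrete: with entrypoint generation moved into the metadata code generation, the emitted source keeps a packed-signature wrapper around the unpacked run function. The sketch below is a stand-alone approximation of that shape, not the actual generated output; MiniTVMValue/MiniDLTensor are simplified stand-ins for the runtime's TVMValue/DLTensor, and the function names are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the runtime's TVMValue and DLTensor types. */
typedef struct { void* data; } MiniDLTensor;
typedef union { void* v_handle; } MiniTVMValue;

/* Hypothetical run function as emitted with unpacked operators: raw data pointers. */
static int32_t tvmgen_run(void* input0, void* output0) {
  *(float*)output0 = *(float*)input0 * 2.0f;
  return 0;
}

/* Packed-signature wrapper: forwards only the DLTensor data pointers to the run function. */
static int32_t tvm_entrypoint(void* args, void* type_code, int num_args, void* out_value,
                              void* out_type_code, void* resource_handle) {
  (void)type_code; (void)num_args; (void)out_value; (void)out_type_code; (void)resource_handle;
  MiniTVMValue* values = (MiniTVMValue*)args;
  return tvmgen_run(((MiniDLTensor*)values[0].v_handle)->data,
                    ((MiniDLTensor*)values[1].v_handle)->data);
}

int main(void) {
  float in = 3.0f, out = 0.0f;
  MiniDLTensor in_t = {&in}, out_t = {&out};
  MiniTVMValue values[2] = {{&in_t}, {&out_t}};
  tvm_entrypoint(values, NULL, 2, NULL, NULL, NULL);
  printf("out = %f\n", (double)out);
  return 0;
}

Because the wrapper lives in the generated metadata module rather than in the lowered PrimFuncs, other entrypoint flavours (such as the micro entrypoint mentioned above) can be produced without touching the compiler passes.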
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 2a07ed26a341..2d3b6f47d8a7 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -128,22 +128,6 @@ transform::Pass Filter(FCond fcond) { return tir::transform::CreatePrimFuncPass(fpass, 0, "Filter", {}); } -transform::Pass FilterCallingConv(CallingConv calling_conv, bool should_match) { - return Filter([calling_conv, should_match](const tir::PrimFunc& f) { - auto actual_conv = f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)); - bool does_match = actual_conv == calling_conv; - return does_match == should_match; - }); -} - -transform::Pass FilterCallingConv(CallingConv calling_conv) { - return FilterCallingConv(calling_conv, true); -} - -transform::Pass FilterNotCallingConv(CallingConv calling_conv) { - return FilterCallingConv(calling_conv, false); -} - IRModule lower(te::Schedule sch, const Array& args, const std::string& name, const std::unordered_map& binds) { Array out_arg_list; @@ -201,10 +185,12 @@ IRModule lower(te::Schedule sch, const Array& args, const std::strin return mod; } -IRModule MixedPasses(IRModule mod_mixed, const Target& target, - const transform::PassContext& pass_ctx, const tvm::transform::Pass& filter, - bool use_unpacked_api) { - Array mixed_pass_list = {filter, BindTarget(target), +std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, + const Target& target_host_arg, + const transform::PassContext& pass_ctx) { + Target target = target_arg, target_host = target_host_arg; + CheckAndUpdateHostConsistency(&target, &target_host); + Array mixed_pass_list = {BindTarget(target), tir::transform::VerifyMemory()}; if (pass_ctx->GetConfig("tir.detect_global_barrier", Bool(false)).value()) { @@ -215,7 +201,7 @@ IRModule MixedPasses(IRModule mod_mixed, const Target& target, mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); - if (use_unpacked_api) { + if (target->GetAttr("no-typed-operators").value_or(Bool(false))) { mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); } else { mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); @@ -224,28 +210,13 @@ IRModule MixedPasses(IRModule mod_mixed, const Target& target, mixed_pass_list.push_back(tir::transform::SplitHostDevice()); auto opt_mixed = transform::Sequential(mixed_pass_list); - return opt_mixed(mod_mixed); -} - -std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, - const Target& target_host_arg, - const transform::PassContext& pass_ctx) { - Target target = target_arg, target_host = target_host_arg; - CheckAndUpdateHostConsistency(&target, &target_host); - - // Run default passes over entrypoint function - auto entrypoint_filter = FilterCallingConv(CallingConv::kEntryPoint); - auto entrypoint_mod = MixedPasses(mod_mixed, target, pass_ctx, entrypoint_filter, false); - - // Create passes for untyped operators but maintain default API for entrypoint - auto untyped_operators = target->GetAttr("no-typed-operators").value_or(Bool(false)); - auto operator_filter = FilterNotCallingConv(CallingConv::kEntryPoint); - mod_mixed = MixedPasses(mod_mixed, target, pass_ctx, operator_filter, untyped_operators); - - mod_mixed->Update(entrypoint_mod); + mod_mixed = opt_mixed(std::move(mod_mixed)); auto host_pass_list = { - FilterNotCallingConv(CallingConv::kDeviceKernelLaunch), + Filter([](const tir::PrimFunc& f) { + return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) != + 
CallingConv::kDeviceKernelLaunch; + }), BindTarget(target_host), tir::transform::LowerTVMBuiltin(), tir::transform::LowerCustomDatatypes(), @@ -259,7 +230,10 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target // device pipeline auto device_pass_list = { - FilterCallingConv(CallingConv::kDeviceKernelLaunch), + Filter([](const tir::PrimFunc& f) { + return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == + CallingConv::kDeviceKernelLaunch; + }), BindTarget(target), tir::transform::LowerWarpMemory(), tir::transform::Simplify(), diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 0d438765878f..f1a92998e3e0 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -137,10 +137,17 @@ class AOTExecutorCodegen : public ExprVisitor { // Pack the sid inside the TVMValue auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); auto sid_value = sids_table_[sid]; - tvm::PrimExpr set_tensor = - tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {sid_array, 0, tir::builtin::kArrData, sid_value}); - stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); + + if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + stmts_.push_back(tir::LetStmt(sid_array, sid_value, tir::Evaluate(0))); + } else { + tvm::PrimExpr set_tensor = + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {sid_array, 0, tir::builtin::kArrData, sid_value}); + stmts_.push_back( + tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); + } + sid_vars.push_back(sid_array); } return sid_vars; @@ -161,16 +168,15 @@ class AOTExecutorCodegen : public ExprVisitor { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - tvm::PrimExpr set_param_array = - tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {param_array, 0, tir::builtin::kArrData, param_handle}); - lookup_call.push_back(tir::Evaluate(set_param_array)); - - tir::Stmt lookup_body = tir::SeqStmt(lookup_call); + if (!target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + tvm::PrimExpr set_param_array = + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {param_array, 0, tir::builtin::kArrData, param_handle}); + stmts_.push_back(tir::LetStmt(param_array, StackAlloca("arg_value", 1), tir::Evaluate(set_param_array))); + } else { + stmts_.push_back(tir::LetStmt(param_array, param_handle, tir::Evaluate(0))); + } - // Allocate the DLTensors on the stack - lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), lookup_body); - stmts_.push_back(lookup_body); return param_array; } @@ -192,45 +198,28 @@ class AOTExecutorCodegen : public ExprVisitor { } } - /*! - * \brief Unpacks a buffer if operators are using the unpacked C-style interface - */ - PrimExpr UnpackBufferIfUnpackedSignature(tir::Var arg) { - auto untyped_operators = - target_host_->GetAttr("no-typed-operators").value_or(Bool(false)); - if (!untyped_operators) { - return arg; - } - - return tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {arg, 0, tir::builtin::kArrData}); - } - /*! 
* brief Call a function with a given name */ void CreateFuncCall(Call call, std::string func_name) { - auto untyped_operators = - target_host_->GetAttr("no-typed-operators").value_or(Bool(false)); - tvm::Array args{tvm::tir::StringImm(func_name)}; std::vector create_func_call_stmts; // Pack the inputs for (Expr arg : call->args) { auto var_arg = FindExpr(arg); - args.push_back(UnpackBufferIfUnpackedSignature(var_arg[0])); + args.push_back(var_arg[0]); } auto ret_expr = Downcast(call); // Pack the return(s) value. A call node can produce multiple outputs for (const auto& var : PackSid(ret_expr)) { - args.push_back(UnpackBufferIfUnpackedSignature(var)); + args.push_back(var); } // Use tvm_call_packed to execute the function unless we're calling directly auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked(); - if (untyped_operators) { + if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { calling_pattern = tvm::tir::builtin::call_extern(); } @@ -248,16 +237,20 @@ class AOTExecutorCodegen : public ExprVisitor { * copy-on-write fashion. */ void CopyToOutput(te::Var out, te::Var in, size_t size) { - auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {in, 0, tir::builtin::kArrData}); - // Define intermediate DLTensor to load/store the data auto tmp0 = te::Var("tmp0", DataType::Handle()); auto tmp1 = te::Var("tmp1", DataType::Handle()); te::Var loop_idx("i", DataType::Int(32)); auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true()); - auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {out, 0, tir::builtin::kArrData}); + + PrimExpr retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {in, 0, tir::builtin::kArrData}); + PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {out, 0, tir::builtin::kArrData}); + if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + retval_get = in; + tostore = out; + } // Copy the variable from the input to the output tir::Stmt copy = tir::For( @@ -540,7 +533,6 @@ class AOTExecutorCodegen : public ExprVisitor { // Define the PrimFunc attributes Map dict_attrs; dict_attrs.Set("global_symbol", runtime::String(runtime::symbol::tvm_run_func_prefix)); - dict_attrs.Set(tvm::attr::kCallingConv, Integer(CallingConv::kEntryPoint)); // Make the PrimFunc return tir::PrimFunc(main_signature_, body, VoidType(), Map(), diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 661df9305036..2006c3457e75 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -192,17 +192,59 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << "}\n"; } + void GenerateUntypedEntrypoint() { + code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix << "("; + int total_args = (metadata_->num_inputs + metadata_->num_outputs); + for (int i = 0; i < total_args; ++i) { + code_ << "arg" << i; + if (i + 1 != total_args) { + code_ << ","; + } + } + code_ << ");\n"; + code_ << "static int32_t _tvm_entrypoint"; + code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " + "out_type_code, void* resource_handle) {\n"; + code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix << "("; + for (int i = 0; i < metadata_->num_inputs; ++i) { + code_ << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,"; + } + for (int i = 0; i < metadata_->num_outputs; 
++i) { + int j = metadata_->num_inputs + i; + code_ << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data"; + if (i + 1 != metadata_->num_outputs) { + code_ << ","; + } + } + code_ << ");\n"; + code_ << "}\n"; + } + + void GenerateTypedEntrypoint() { + code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix; + code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " + "out_type_code, void* resource_handle);\n"; + code_ << "static int32_t _tvm_entrypoint"; + code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " + "out_type_code, void* resource_handle) {\n"; + code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix; + code_ << "(args, type_code, num_args, out_value, out_type_code, resource_handle);\n"; + code_ << "}\n"; + } + void GenerateAOTDescriptor() { code_ << "#include \"tvm/runtime/crt/internal/aot_executor/aot_executor.h\"\n"; code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n"; code_ << "#ifdef __cplusplus\n"; code_ << "extern \"C\"\n"; code_ << "#endif\n"; - code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix; - code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " - "out_type_code, void* resource_handle);\n"; + if (target_->GetAttr("no-typed-operators").value_or(Bool(false))) { + GenerateUntypedEntrypoint(); + } else { + GenerateTypedEntrypoint(); + } code_ << "const tvm_model_t network = {\n" - << " .run_func = &" << ::tvm::runtime::symbol::tvm_run_func_prefix << ",\n" + << " .run_func = &_tvm_entrypoint,\n" << " .num_input_tensors = " << metadata_->num_inputs << ",\n" << " .num_output_tensors = " << metadata_->num_outputs << ", \n" << "};\n"; diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 793db05ad609..0cc0086897d8 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -290,9 +290,8 @@ Pass MakePackedAPI(int num_unpacked_args) { for (const auto& kv : mptr->functions) { if (auto* n = kv.second.as()) { PrimFunc func = GetRef(n); - auto calling_conv = - func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)); - if (calling_conv == CallingConv::kDefault || calling_conv == CallingConv::kEntryPoint) { + if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == + CallingConv::kDefault) { auto updated_func = MakePackedAPI(std::move(func), num_unpacked_args); updates.push_back({kv.first, updated_func}); } diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index 00133cd5e1ab..154d0bfa5787 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -56,8 +56,7 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { Integer device_id(0); PrimExpr node = StringImm("default"); const Stmt nop = Evaluate(0); - std::vector device_init = {AttrStmt(node, attr::device_id, device_id, nop), - AttrStmt(node, attr::device_type, device_type, nop)}; + std::vector device_init; // Create arg to buffer binder std::unordered_map vmap; @@ -90,6 +89,11 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { binder.Bind(kv.second->data, kv.first, kv.first->name_hint, true); } + if (buffer_def.size()) { + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + } + func_ptr->body = MergeNest({device_init, binder.init_nest(), binder.asserts()}, func_ptr->body); func_ptr->params = args; 
func_ptr->ret_type = PrimType(DataType::Int(32)); diff --git a/tests/python/unittest/test_tir_transform_make_unpacked_api.py b/tests/python/unittest/test_tir_transform_make_unpacked_api.py index 92526652d4c3..8e53cd729c01 100644 --- a/tests/python/unittest/test_tir_transform_make_unpacked_api.py +++ b/tests/python/unittest/test_tir_transform_make_unpacked_api.py @@ -68,6 +68,19 @@ def test_device_setup(mod, target, dev): assert f.body.body.value == dev.device_type +def test_no_buffers_no_device_setup(): + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) + stmt = ib.get() + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt)) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 0 + assert f.body.value == 0 + + def test_argument_mapping(mod): f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] assert len(f.params) == 1 From 00000ba42d7d72b05045c918b32e6ada941a26c9 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Wed, 12 May 2021 14:51:31 +0000 Subject: [PATCH 3/9] Use buffers in make_unpacked_api tests --- .../test_tir_transform_make_unpacked_api.py | 45 ++++++++----------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/tests/python/unittest/test_tir_transform_make_unpacked_api.py b/tests/python/unittest/test_tir_transform_make_unpacked_api.py index 8e53cd729c01..9d917466758b 100644 --- a/tests/python/unittest/test_tir_transform_make_unpacked_api.py +++ b/tests/python/unittest/test_tir_transform_make_unpacked_api.py @@ -24,8 +24,7 @@ @pytest.fixture def mod_without_attrs(): ib = tvm.tir.ir_builder.create() - n = tvm.runtime.convert(4) - A = ib.pointer("float32", name="A") + A = tvm.tir.decl_buffer(name="A", shape=[1]) stmt = ib.get() return tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], stmt)) @@ -70,15 +69,16 @@ def test_device_setup(mod, target, dev): def test_no_buffers_no_device_setup(): ib = tvm.tir.ir_builder.create() - n = tvm.runtime.convert(4) + A = ib.pointer("float32", name="A") stmt = ib.get() - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt)) + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], stmt)) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] - assert len(f.params) == 0 - assert f.body.value == 0 + assert len(f.params) == 1 + assert f.body.var.name == "A" + assert f.body.value.name == "arg0" def test_argument_mapping(mod): @@ -91,9 +91,9 @@ def test_argument_mapping(mod): def test_argument_mapping_multiple(): ib = tvm.tir.ir_builder.create() - n = tvm.runtime.convert(4) - A = ib.pointer("float32", name="A") - B = ib.pointer("float32", name="B") + A = tvm.tir.decl_buffer(name="A", shape=[1]) + B = tvm.tir.decl_buffer(name="B", shape=[1]) + stmt = ib.get() mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B], stmt)) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) @@ -111,9 +111,8 @@ def test_argument_mapping_multiple(): def test_argument_mapping_multiple_matching(): ib = tvm.tir.ir_builder.create() - n = tvm.runtime.convert(4) - A = ib.pointer("float32", name="A") - B = ib.pointer("float32", name="B") + A = tvm.tir.decl_buffer(name="A", shape=[1]) + B = tvm.tir.decl_buffer(name="B", shape=[1]) 
stmt = ib.get() mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, A], stmt)) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) @@ -129,13 +128,12 @@ def test_argument_mapping_multiple_matching(): assert f.body.body.body.body.condition.b.name == "arg1" -def test_body(mod): +def test_body(): ib = tvm.tir.ir_builder.create() - n = tvm.runtime.convert(4) - A = ib.pointer("float32", name="A") - B = ib.pointer("float32", name="B") - C = ib.pointer("float32", name="C") - C[0] = A[0] + B[0] + A = tvm.tir.decl_buffer(name="A", shape=[1]) + B = tvm.tir.decl_buffer(name="B", shape=[1]) + C = ib.buffer_ptr(A) + stmt = ib.get() mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B, C], stmt)) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) @@ -146,16 +144,11 @@ def test_body(mod): assert f.params[1].name == "arg1" assert f.params[2].name == "arg2" assert f.body.body.body.var.name == "A" - assert f.body.body.body.value.name == "arg0" + assert f.body.body.body.value.name == "arg2" assert f.body.body.body.body.var.name == "B" assert f.body.body.body.body.value.name == "arg1" - assert f.body.body.body.body.body.var.name == "C" - assert f.body.body.body.body.body.value.name == "arg2" - assert f.body.body.body.body.body.body.buffer_var.name == "C" - assert f.body.body.body.body.body.body.value.a.buffer_var.name == "A" - assert f.body.body.body.body.body.body.value.a.index == 0 - assert f.body.body.body.body.body.body.value.b.buffer_var.name == "B" - assert f.body.body.body.body.body.body.value.b.index == 0 + assert f.body.body.body.body.body.condition.a.name == "A" + assert f.body.body.body.body.body.condition.b.name == "arg0" if __name__ == "__main__": From af6e7665487e4c1db08db5f55c1c7878cf082144 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 13 May 2021 08:42:41 +0000 Subject: [PATCH 4/9] Enable --no-typed-operators for llvm --- src/target/target_kind.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 355bc4e3b4ff..a60ae64597e8 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -258,6 +258,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("system-lib") .add_attr_option("runtime") .add_attr_option("link-params", Bool(false)) + .add_attr_option("no-typed-operators", Bool(false)) .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) From c468c463dc38719269e2700d18491c0458961339 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 13 May 2021 14:28:26 +0000 Subject: [PATCH 5/9] Change --no-typed-operators to --typed-operators=0 to match other options --- src/driver/driver_api.cc | 6 +++--- src/relay/backend/aot_executor_codegen.cc | 17 ++++++++-------- src/target/source/source_module.cc | 6 +++--- src/target/target_kind.cc | 4 ++-- tests/python/relay/aot/test_crt_aot.py | 24 +++++++++++------------ 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 2d3b6f47d8a7..d873b16663b5 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -201,10 +201,10 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); - if (target->GetAttr("no-typed-operators").value_or(Bool(false))) { - mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); - } else { + if 
(target->GetAttr("typed-operators").value_or(Bool(true))) { mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); + } else { + mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); } mixed_pass_list.push_back(tir::transform::SplitHostDevice()); diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index f1a92998e3e0..4c0e2131dbf6 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -138,14 +138,14 @@ class AOTExecutorCodegen : public ExprVisitor { auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); auto sid_value = sids_table_[sid]; - if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { - stmts_.push_back(tir::LetStmt(sid_array, sid_value, tir::Evaluate(0))); - } else { + if (target_host_->GetAttr("typed-operators").value_or(Bool(true))) { tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {sid_array, 0, tir::builtin::kArrData, sid_value}); stmts_.push_back( tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); + } else { + stmts_.push_back(tir::LetStmt(sid_array, sid_value, tir::Evaluate(0))); } sid_vars.push_back(sid_array); @@ -168,11 +168,12 @@ class AOTExecutorCodegen : public ExprVisitor { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - if (!target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + if (target_host_->GetAttr("typed-operators").value_or(Bool(true))) { tvm::PrimExpr set_param_array = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {param_array, 0, tir::builtin::kArrData, param_handle}); - stmts_.push_back(tir::LetStmt(param_array, StackAlloca("arg_value", 1), tir::Evaluate(set_param_array))); + {param_array, 0, tir::builtin::kArrData, param_handle}); + stmts_.push_back( + tir::LetStmt(param_array, StackAlloca("arg_value", 1), tir::Evaluate(set_param_array))); } else { stmts_.push_back(tir::LetStmt(param_array, param_handle, tir::Evaluate(0))); } @@ -219,7 +220,7 @@ class AOTExecutorCodegen : public ExprVisitor { // Use tvm_call_packed to execute the function unless we're calling directly auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked(); - if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + if (!target_host_->GetAttr("typed-operators").value_or(Bool(true))) { calling_pattern = tvm::tir::builtin::call_extern(); } @@ -247,7 +248,7 @@ class AOTExecutorCodegen : public ExprVisitor { {in, 0, tir::builtin::kArrData}); PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), {out, 0, tir::builtin::kArrData}); - if (target_host_->GetAttr("no-typed-operators").value_or(Bool(false))) { + if (!target_host_->GetAttr("typed-operators").value_or(Bool(true))) { retval_get = in; tostore = out; } diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 2006c3457e75..25db049965f0 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -238,10 +238,10 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "#ifdef __cplusplus\n"; code_ << "extern \"C\"\n"; code_ << "#endif\n"; - if (target_->GetAttr("no-typed-operators").value_or(Bool(false))) { - GenerateUntypedEntrypoint(); - } else { + if (target_->GetAttr("typed-operators").value_or(Bool(true))) { GenerateTypedEntrypoint(); + 
} else { + GenerateUntypedEntrypoint(); } code_ << "const tvm_model_t network = {\n" << " .run_func = &_tvm_entrypoint,\n" diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index a60ae64597e8..3bc227635a5c 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -258,7 +258,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("system-lib") .add_attr_option("runtime") .add_attr_option("link-params", Bool(false)) - .add_attr_option("no-typed-operators", Bool(false)) + .add_attr_option("typed-operators") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) @@ -269,7 +269,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("march") .add_attr_option("executor") .add_attr_option("workspace-byte-alignment") - .add_attr_option("no-typed-operators", Bool(false)) + .add_attr_option("typed-operators") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 7638428eafcf..6b9f3ede0966 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -44,7 +44,7 @@ @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_conv_with_params(use_calculated_workspaces, target_options): RELAY_MODEL = """ #[version = "0.0.5"] @@ -78,7 +78,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_add_with_params(use_calculated_workspaces, target_options): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -99,7 +99,7 @@ def test_add_with_params(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_conv2d(use_calculated_workspaces, target_options): """Test a subgraph with a single conv2d operator.""" @@ -146,7 +146,7 @@ def group_conv2d(): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_concatenate(use_calculated_workspaces, target_options): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) @@ -167,7 +167,7 @@ def test_concatenate(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_nested_tuples(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) @@ -185,7 +185,7 @@ def test_nested_tuples(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_tuple_getitem(use_calculated_workspaces, target_options): 
func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) @@ -194,7 +194,7 @@ def test_tuple_getitem(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_id(use_calculated_workspaces, target_options): x = relay.var("x", "float32") ident = relay.Function([x], x) @@ -206,7 +206,7 @@ def test_id(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_add_const(use_calculated_workspaces, target_options): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) @@ -216,7 +216,7 @@ def test_add_const(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_mul_param(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) @@ -230,7 +230,7 @@ def test_mul_param(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_subtract(use_calculated_workspaces, target_options): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) @@ -243,7 +243,7 @@ def test_subtract(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) def test_tuple_output(use_calculated_workspaces, target_options): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() @@ -262,7 +262,7 @@ def test_tuple_output(use_calculated_workspaces, target_options): @pytest.mark.parametrize( "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] ) -@pytest.mark.parametrize("target_options", ["--no-typed-operators"]) +@pytest.mark.parametrize("target_options", ["--typed-operators=0"]) def test_mobilenet(use_calculated_workspaces_and_alignment, target_options): use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] From 95b020461baf2eff7eecf7919bfc65da67e0f7cd Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 21 May 2021 16:40:25 +0100 Subject: [PATCH 6/9] Refactor typed-operators lookup into use_typed_operators_ (Also contains minor clean up of output variables) --- src/relay/backend/aot_executor_codegen.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 4c0e2131dbf6..99c1e075d379 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -138,7 +138,7 @@ class AOTExecutorCodegen : public ExprVisitor { auto 
sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); auto sid_value = sids_table_[sid]; - if (target_host_->GetAttr("typed-operators").value_or(Bool(true))) { + if (use_typed_operators_) { tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {sid_array, 0, tir::builtin::kArrData, sid_value}); @@ -168,7 +168,7 @@ class AOTExecutorCodegen : public ExprVisitor { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - if (target_host_->GetAttr("typed-operators").value_or(Bool(true))) { + if (use_typed_operators_) { tvm::PrimExpr set_param_array = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {param_array, 0, tir::builtin::kArrData, param_handle}); @@ -220,7 +220,7 @@ class AOTExecutorCodegen : public ExprVisitor { // Use tvm_call_packed to execute the function unless we're calling directly auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked(); - if (!target_host_->GetAttr("typed-operators").value_or(Bool(true))) { + if (!use_typed_operators_) { calling_pattern = tvm::tir::builtin::call_extern(); } @@ -248,7 +248,7 @@ class AOTExecutorCodegen : public ExprVisitor { {in, 0, tir::builtin::kArrData}); PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), {out, 0, tir::builtin::kArrData}); - if (!target_host_->GetAttr("typed-operators").value_or(Bool(true))) { + if (!use_typed_operators_) { retval_get = in; tostore = out; } @@ -551,6 +551,8 @@ class AOTExecutorCodegen : public ExprVisitor { TargetsMap targets_; /*! \brief target host */ Target target_host_; + /*! \brief untyped operators flag */ + Bool use_typed_operators_; /*! * \brief parameters (i.e. ConstantNodes found in the graph). 
@@ -580,10 +582,11 @@ class AOTExecutorCodegen : public ExprVisitor { public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) - : mod_(mod), return_sid_() { + : mod_(mod), use_typed_operators_(true) { compile_engine_ = CompileEngine::Global(); targets_ = targets; target_host_ = target_host; + use_typed_operators_ = target_host->GetAttr("typed-operators").value_or(Bool(true)); } LoweredOutput Codegen(relay::Function func) { @@ -607,8 +610,7 @@ class AOTExecutorCodegen : public ExprVisitor { // Find the return sid return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { - auto output_var = tir::Var("output", DataType::Handle()); - main_signature_.push_back(output_var); + main_signature_.push_back(tir::Var("output", DataType::Handle())); } VisitExpr(func->body); From 7344269ab2a8f4c00b48d542f649f16b6104013d Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Wed, 26 May 2021 13:09:48 +0000 Subject: [PATCH 7/9] Rename --typed-operators to --unpacked-api (Also moves the entrypoint name to a constant) --- include/tvm/runtime/module.h | 2 ++ src/driver/driver_api.cc | 6 +++--- src/relay/backend/aot_executor_codegen.cc | 14 ++++++------- src/target/source/source_module.cc | 16 +++++++-------- src/target/target_kind.cc | 4 ++-- tests/cpp/target_test.cc | 3 ++- tests/python/relay/aot/test_crt_aot.py | 24 +++++++++++------------ 7 files changed, 36 insertions(+), 33 deletions(-) diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 689fe6fa53fc..5c70b2d5955b 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -232,6 +232,8 @@ constexpr const char* tvm_param_prefix = "__tvm_param__"; constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; /*! \brief The main AOT executor function */ constexpr const char* tvm_run_func_prefix = "tvm__run_func"; +/*! \brief The entrypoint function to the generated network */ +constexpr const char* tvm_entrypoint_name = "tvm_entrypoint"; } // namespace symbol // implementations of inline functions. 
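Note, as a reading aid for this patch: the two entrypoint generators changed below emit roughly the following C glue for a single-input, single-output network when the unpacked API is selected. This is only a sketch assembled from the names visible in these hunks (tvm__run_func, tvm_entrypoint, tvm_model_t); the exact expressions used to pull each buffer out of `args` are not shown in this patch, so the casts below are illustrative only.

    /* Internal operators use the unpacked API: one void* per input/output buffer. */
    TVM_DLL int32_t tvm__run_func(void* arg0, void* arg1);

    /* The generated entrypoint keeps the packed-style signature seen in the
     * hunks below and forwards the buffers into the unpacked run function.
     * How each buffer is recovered from `args` is a guess here. */
    static int32_t tvm_entrypoint(void* args, void* type_code, int num_args,
                                  void* out_value, void* out_type_code,
                                  void* resource_handle) {
      return tvm__run_func(((void**)args)[0], ((void**)args)[1]);
    }

    const tvm_model_t network = {
        .run_func = &tvm_entrypoint,
        .num_input_tensors = 1,
        .num_output_tensors = 1,
    };
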
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index d873b16663b5..7c304727080e 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -201,10 +201,10 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); - if (target->GetAttr("typed-operators").value_or(Bool(true))) { - mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); - } else { + if (target->GetAttr("unpacked-api").value_or(Bool(false))) { mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); + } else { + mixed_pass_list.push_back(tir::transform::MakePackedAPI(0)); } mixed_pass_list.push_back(tir::transform::SplitHostDevice()); diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 99c1e075d379..efed44280685 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -138,7 +138,7 @@ class AOTExecutorCodegen : public ExprVisitor { auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); auto sid_value = sids_table_[sid]; - if (use_typed_operators_) { + if (!use_unpacked_api_) { tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {sid_array, 0, tir::builtin::kArrData, sid_value}); @@ -168,7 +168,7 @@ class AOTExecutorCodegen : public ExprVisitor { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - if (use_typed_operators_) { + if (!use_unpacked_api_) { tvm::PrimExpr set_param_array = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {param_array, 0, tir::builtin::kArrData, param_handle}); @@ -220,7 +220,7 @@ class AOTExecutorCodegen : public ExprVisitor { // Use tvm_call_packed to execute the function unless we're calling directly auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked(); - if (!use_typed_operators_) { + if (use_unpacked_api_) { calling_pattern = tvm::tir::builtin::call_extern(); } @@ -248,7 +248,7 @@ class AOTExecutorCodegen : public ExprVisitor { {in, 0, tir::builtin::kArrData}); PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), {out, 0, tir::builtin::kArrData}); - if (!use_typed_operators_) { + if (use_unpacked_api_) { retval_get = in; tostore = out; } @@ -552,7 +552,7 @@ class AOTExecutorCodegen : public ExprVisitor { /*! \brief target host */ Target target_host_; /*! \brief untyped operators flag */ - Bool use_typed_operators_; + Bool use_unpacked_api_; /*! * \brief parameters (i.e. ConstantNodes found in the graph). 
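Note on flag polarity after the rename: every lookup now falls back to the packed API when the attribute is absent (the value_or(Bool(false)) calls above), so the unpacked internal API has to be opted into explicitly. In target-option terms, what earlier patches in this series spelled as

    --typed-operators=0

is spelled from this patch onwards as

    --unpacked-api    (equivalently --unpacked-api=1)

as the updated test parametrizations further down show.
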
@@ -582,11 +582,11 @@ class AOTExecutorCodegen : public ExprVisitor { public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) - : mod_(mod), use_typed_operators_(true) { + : mod_(mod), use_unpacked_api_(false) { compile_engine_ = CompileEngine::Global(); targets_ = targets; target_host_ = target_host; - use_typed_operators_ = target_host->GetAttr("typed-operators").value_or(Bool(true)); + use_unpacked_api_ = target_host->GetAttr("unpacked-api").value_or(Bool(false)); } LoweredOutput Codegen(relay::Function func) { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 25db049965f0..11d79464105c 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -192,7 +192,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << "}\n"; } - void GenerateUntypedEntrypoint() { + void GenerateEntrypointForUnpackedAPI() { code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix << "("; int total_args = (metadata_->num_inputs + metadata_->num_outputs); for (int i = 0; i < total_args; ++i) { @@ -202,7 +202,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } } code_ << ");\n"; - code_ << "static int32_t _tvm_entrypoint"; + code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix << "("; @@ -220,11 +220,11 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "}\n"; } - void GenerateTypedEntrypoint() { + void GenerateEntrypointForPackedAPI() { code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle);\n"; - code_ << "static int32_t _tvm_entrypoint"; + code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix; @@ -238,13 +238,13 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "#ifdef __cplusplus\n"; code_ << "extern \"C\"\n"; code_ << "#endif\n"; - if (target_->GetAttr("typed-operators").value_or(Bool(true))) { - GenerateTypedEntrypoint(); + if (target_->GetAttr("unpacked-api").value_or(Bool(false))) { + GenerateEntrypointForUnpackedAPI(); } else { - GenerateUntypedEntrypoint(); + GenerateEntrypointForPackedAPI(); } code_ << "const tvm_model_t network = {\n" - << " .run_func = &_tvm_entrypoint,\n" + << " .run_func = &" << ::tvm::runtime::symbol::tvm_entrypoint_name << ",\n" << " .num_input_tensors = " << metadata_->num_inputs << ",\n" << " .num_output_tensors = " << metadata_->num_outputs << ", \n" << "};\n"; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 3bc227635a5c..5ee15b122c25 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -258,7 +258,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("system-lib") .add_attr_option("runtime") .add_attr_option("link-params", Bool(false)) - .add_attr_option("typed-operators") + .add_attr_option("unpacked-api", Bool(false)) .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) @@ -269,7 +269,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) 
.add_attr_option("march") .add_attr_option("executor") .add_attr_option("workspace-byte-alignment") - .add_attr_option("typed-operators") + .add_attr_option("unpacked-api", Bool(false)) .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index 8dba462132ac..353023573923 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -147,9 +147,10 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->keys.size(), 2U); ICHECK_EQ(target->keys[0], "cpu"); ICHECK_EQ(target->keys[1], "arm_cpu"); - ICHECK_EQ(target->attrs.size(), 2U); + ICHECK_EQ(target->attrs.size(), 3U); ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); ICHECK_EQ(target->GetAttr("link-params"), false); + ICHECK_EQ(target->GetAttr("unpacked-api"), false); } TEST(TargetKindRegistryListTargetKinds, Basic) { diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 6b9f3ede0966..c936e3f6dc7b 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -44,7 +44,7 @@ @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_conv_with_params(use_calculated_workspaces, target_options): RELAY_MODEL = """ #[version = "0.0.5"] @@ -78,7 +78,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_add_with_params(use_calculated_workspaces, target_options): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -99,7 +99,7 @@ def test_add_with_params(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_conv2d(use_calculated_workspaces, target_options): """Test a subgraph with a single conv2d operator.""" @@ -146,7 +146,7 @@ def group_conv2d(): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_concatenate(use_calculated_workspaces, target_options): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) @@ -167,7 +167,7 @@ def test_concatenate(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_nested_tuples(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) @@ -185,7 +185,7 @@ def test_nested_tuples(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_tuple_getitem(use_calculated_workspaces, target_options): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) 
output_list = generate_ref_data(func, {}) @@ -194,7 +194,7 @@ def test_tuple_getitem(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_id(use_calculated_workspaces, target_options): x = relay.var("x", "float32") ident = relay.Function([x], x) @@ -206,7 +206,7 @@ def test_id(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_add_const(use_calculated_workspaces, target_options): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) @@ -216,7 +216,7 @@ def test_add_const(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_mul_param(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) @@ -230,7 +230,7 @@ def test_mul_param(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_subtract(use_calculated_workspaces, target_options): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) @@ -243,7 +243,7 @@ def test_subtract(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) def test_tuple_output(use_calculated_workspaces, target_options): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() @@ -262,7 +262,7 @@ def test_tuple_output(use_calculated_workspaces, target_options): @pytest.mark.parametrize( "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] ) -@pytest.mark.parametrize("target_options", ["--typed-operators=0"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api"]) def test_mobilenet(use_calculated_workspaces_and_alignment, target_options): use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] From d1e0ab212729b383d05613bc76f97530ea74dc9a Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 1 Jun 2021 08:25:57 +0000 Subject: [PATCH 8/9] Move all properties into init list to avoid double init --- src/relay/backend/aot_executor_codegen.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index efed44280685..44adb508f887 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -582,12 +582,11 @@ class AOTExecutorCodegen : public ExprVisitor { public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) - : mod_(mod), use_unpacked_api_(false) { - compile_engine_ = CompileEngine::Global(); - targets_ = 
targets; - target_host_ = target_host; - use_unpacked_api_ = target_host->GetAttr("unpacked-api").value_or(Bool(false)); - } + : mod_(mod), + targets_(targets), + target_host_(target_host), + use_unpacked_api_(target_host->GetAttr("unpacked-api").value_or(Bool(false))), + compile_engine_(CompileEngine::Global()) {} LoweredOutput Codegen(relay::Function func) { // Get the module, storage map and token sizes From f874d7241328221816e5f1660efd436010cf5b8f Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Wed, 2 Jun 2021 11:14:46 +0000 Subject: [PATCH 9/9] Remove AutoTVM breaking default and improve clarity --- include/tvm/runtime/module.h | 2 -- src/relay/backend/aot_executor_codegen.cc | 9 ++++++++- src/target/source/source_module.cc | 6 +++--- src/target/target_kind.cc | 4 ++-- tests/cpp/target_test.cc | 3 +-- tests/python/relay/aot/test_crt_aot.py | 22 +++++++++++----------- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 5c70b2d5955b..689fe6fa53fc 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -232,8 +232,6 @@ constexpr const char* tvm_param_prefix = "__tvm_param__"; constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; /*! \brief The main AOT executor function */ constexpr const char* tvm_run_func_prefix = "tvm__run_func"; -/*! \brief The entrypoint function to the generated network */ -constexpr const char* tvm_entrypoint_name = "tvm_entrypoint"; } // namespace symbol // implementations of inline functions. diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 44adb508f887..66294d1dd076 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -551,7 +551,14 @@ class AOTExecutorCodegen : public ExprVisitor { TargetsMap targets_; /*! \brief target host */ Target target_host_; - /*! \brief untyped operators flag */ + /*! + * \brief unpacked api toggle + * When set to true the code generated will use unpacked calls to functions: + * func(void* arg0, void* arg1) + * Rather than packed calls: + * func(void* args) + * Defaults to using the packed calling convention + */ Bool use_unpacked_api_; /*! 
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 11d79464105c..992df61980f8 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -202,7 +202,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } } code_ << ");\n"; - code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_entrypoint_name; + code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix << "("; @@ -224,7 +224,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle);\n"; - code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_entrypoint_name; + code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix; @@ -244,7 +244,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { GenerateEntrypointForPackedAPI(); } code_ << "const tvm_model_t network = {\n" - << " .run_func = &" << ::tvm::runtime::symbol::tvm_entrypoint_name << ",\n" + << " .run_func = &" << ::tvm::runtime::symbol::tvm_module_main << ",\n" << " .num_input_tensors = " << metadata_->num_inputs << ",\n" << " .num_output_tensors = " << metadata_->num_outputs << ", \n" << "};\n"; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 5ee15b122c25..b2a813a874e6 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -258,7 +258,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("system-lib") .add_attr_option("runtime") .add_attr_option("link-params", Bool(false)) - .add_attr_option("unpacked-api", Bool(false)) + .add_attr_option("unpacked-api") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) @@ -269,7 +269,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("march") .add_attr_option("executor") .add_attr_option("workspace-byte-alignment") - .add_attr_option("unpacked-api", Bool(false)) + .add_attr_option("unpacked-api") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index 353023573923..8dba462132ac 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -147,10 +147,9 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->keys.size(), 2U); ICHECK_EQ(target->keys[0], "cpu"); ICHECK_EQ(target->keys[1], "arm_cpu"); - ICHECK_EQ(target->attrs.size(), 3U); + ICHECK_EQ(target->attrs.size(), 2U); ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); ICHECK_EQ(target->GetAttr("link-params"), false); - ICHECK_EQ(target->GetAttr("unpacked-api"), false); } TEST(TargetKindRegistryListTargetKinds, Basic) { diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index c936e3f6dc7b..4f8de450d9f1 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -44,7 +44,7 @@ @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) 
+@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_conv_with_params(use_calculated_workspaces, target_options): RELAY_MODEL = """ #[version = "0.0.5"] @@ -78,7 +78,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_add_with_params(use_calculated_workspaces, target_options): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -99,7 +99,7 @@ def test_add_with_params(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_conv2d(use_calculated_workspaces, target_options): """Test a subgraph with a single conv2d operator.""" @@ -146,7 +146,7 @@ def group_conv2d(): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_concatenate(use_calculated_workspaces, target_options): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) @@ -167,7 +167,7 @@ def test_concatenate(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_nested_tuples(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) @@ -185,7 +185,7 @@ def test_nested_tuples(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_tuple_getitem(use_calculated_workspaces, target_options): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) @@ -194,7 +194,7 @@ def test_tuple_getitem(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_id(use_calculated_workspaces, target_options): x = relay.var("x", "float32") ident = relay.Function([x], x) @@ -206,7 +206,7 @@ def test_id(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_add_const(use_calculated_workspaces, target_options): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) @@ -216,7 +216,7 @@ def test_add_const(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", 
["--unpacked-api=0", "--unpacked-api=1"]) def test_mul_param(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) @@ -230,7 +230,7 @@ def test_mul_param(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_subtract(use_calculated_workspaces, target_options): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) @@ -243,7 +243,7 @@ def test_subtract(use_calculated_workspaces, target_options): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -@pytest.mark.parametrize("target_options", ["", "--unpacked-api"]) +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_tuple_output(use_calculated_workspaces, target_options): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple()