diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index b1957d2a9cb8..2113d58f1ffa 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -212,6 +212,17 @@ TVM_DLL Pass InstrumentBoundCheckers();
  */
 TVM_DLL Pass MakePackedAPI(int num_unpacked_args);
 
+/*!
+ * \brief Transform the high-level PrimFunc to a C signature that can be used
+ * to call the operator directly.
+ *
+ * The main task of this function is to create code that maps the values in
+ * api_args to the Vars required by the body.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass MakeUnpackedAPI();
+
 /*!
  * \brief Remap the thread axis
  *
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
index be55b48da71e..26b22f99c215 100644
--- a/python/tvm/tir/transform/transform.py
+++ b/python/tvm/tir/transform/transform.py
@@ -347,6 +347,17 @@ def MakePackedAPI(num_unpacked_params=0):
     return _ffi_api.MakePackedAPI(num_unpacked_params)
 
 
+def MakeUnpackedAPI():
+    """Transform the PrimFuncs in the module to a C API compatible with internal calls.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.MakeUnpackedAPI()
+
+
 def SplitHostDevice():
     """Split the function into a host function and device functions.
 
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index f30cecbf7f05..7c304727080e 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -200,8 +200,15 @@ std::pair<IRModule, IRModule> SplitDevHostFuncs(IRModule mod_mixed, const Target
   mixed_pass_list.push_back(tir::transform::ThreadSync("warp"));
   mixed_pass_list.push_back(tir::transform::InferFragment());
   mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce());
-  mixed_pass_list.push_back(tir::transform::MakePackedAPI(0));
+
+  if (target->GetAttr<Bool>("unpacked-api").value_or(Bool(false))) {
+    mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI());
+  } else {
+    mixed_pass_list.push_back(tir::transform::MakePackedAPI(0));
+  }
+
   mixed_pass_list.push_back(tir::transform::SplitHostDevice());
+
   auto opt_mixed = transform::Sequential(mixed_pass_list);
   mod_mixed = opt_mixed(std::move(mod_mixed));
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index a005247d424a..66294d1dd076 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -137,10 +137,17 @@ class AOTExecutorCodegen : public ExprVisitor {
       // Pack the sid inside the TVMValue
       auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle());
       auto sid_value = sids_table_[sid];
-      tvm::PrimExpr set_tensor =
-          tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
-                         {sid_array, 0, tir::builtin::kArrData, sid_value});
-      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor)));
+
+      if (!use_unpacked_api_) {
+        tvm::PrimExpr set_tensor =
+            tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                           {sid_array, 0, tir::builtin::kArrData, sid_value});
+        stmts_.push_back(
+            tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor)));
+      } else {
+        stmts_.push_back(tir::LetStmt(sid_array, sid_value, tir::Evaluate(0)));
+      }
+
       sid_vars.push_back(sid_array);
     }
     return sid_vars;
@@ -161,16 +168,16 @@ class AOTExecutorCodegen : public ExprVisitor {
     auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(),
                                        {tir::StringImm(params_by_expr_[expr])});
-    tvm::PrimExpr set_param_array =
-        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
-                       {param_array, 0, tir::builtin::kArrData, param_handle});
-    lookup_call.push_back(tir::Evaluate(set_param_array));
-
-    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+    if (!use_unpacked_api_) {
+      tvm::PrimExpr set_param_array =
+          tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                         {param_array, 0, tir::builtin::kArrData, param_handle});
+      stmts_.push_back(
+          tir::LetStmt(param_array, StackAlloca("arg_value", 1), tir::Evaluate(set_param_array)));
+    } else {
+      stmts_.push_back(tir::LetStmt(param_array, param_handle, tir::Evaluate(0)));
+    }
 
-    // Allocate the DLTensors on the stack
-    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), lookup_body);
-    stmts_.push_back(lookup_body);
     return param_array;
   }
 
@@ -206,15 +213,20 @@ class AOTExecutorCodegen : public ExprVisitor {
     }
 
     auto ret_expr = Downcast<Expr>(call);
-    // Pack the return(s) value. A call node can produce multiple outputs
     for (const auto& var : PackSid(ret_expr)) {
       args.push_back(var);
     }
 
-    // Use tvm_call_packed to execute the function
-    create_func_call_stmts.push_back(tir::Evaluate(
-        tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args)));
+    // Use tvm_call_cpacked to execute the function unless we're calling directly
+    auto calling_pattern = tvm::tir::builtin::tvm_call_cpacked();
+    if (use_unpacked_api_) {
+      calling_pattern = tvm::tir::builtin::call_extern();
+    }
+
+    create_func_call_stmts.push_back(
+        tir::Evaluate(tvm::tir::Call(DataType::Int(32), calling_pattern, args)));
+
     tir::Stmt body = tir::SeqStmt(create_func_call_stmts);
     stmts_.push_back(body);
   }
@@ -226,16 +238,20 @@ class AOTExecutorCodegen : public ExprVisitor {
    * copy-on-write fashion.
    */
   void CopyToOutput(te::Var out, te::Var in, size_t size) {
-    auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
-                                     {in, 0, tir::builtin::kArrData});
-
     // Define intermediate DLTensor to load/store the data
     auto tmp0 = te::Var("tmp0", DataType::Handle());
     auto tmp1 = te::Var("tmp1", DataType::Handle());
     te::Var loop_idx("i", DataType::Int(32));
     auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true());
-    auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
-                                  {out, 0, tir::builtin::kArrData});
+
+    PrimExpr retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
+                                         {in, 0, tir::builtin::kArrData});
+    PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
+                                      {out, 0, tir::builtin::kArrData});
+    if (use_unpacked_api_) {
+      retval_get = in;
+      tostore = out;
+    }
 
     // Copy the variable from the input to the output
     tir::Stmt copy = tir::For(
@@ -535,6 +551,15 @@ class AOTExecutorCodegen : public ExprVisitor {
   TargetsMap targets_;
   /*! \brief target host */
   Target target_host_;
+  /*!
+   * \brief unpacked api toggle
+   * When set to true, the generated code will use unpacked calls to functions:
+   * func(void* arg0, void* arg1)
+   * rather than packed calls:
+   * func(void* args)
+   * Defaults to the packed calling convention.
+   */
+  Bool use_unpacked_api_;
 
   /*!
    * \brief parameters (i.e. ConstantNodes found in the graph).
@@ -564,21 +589,20 @@ class AOTExecutorCodegen : public ExprVisitor {
 
  public:
   AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host)
-      : mod_(mod), return_sid_() {
-    compile_engine_ = CompileEngine::Global();
-    targets_ = targets;
-    target_host_ = target_host;
-  }
+      : mod_(mod),
+        targets_(targets),
+        target_host_(target_host),
+        use_unpacked_api_(target_host->GetAttr<Bool>("unpacked-api").value_or(Bool(false))),
+        compile_engine_(CompileEngine::Global()) {}
 
   LoweredOutput Codegen(relay::Function func) {
     // Get the module, storage map and token sizes
     auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
     storage_device_map_ = (*pf)(func);
-    int input_index = 0;
     for (auto input : func->params) {
       input_vars_.push_back(input);
-      main_signature_.push_back(tir::Var(MakeString("input_", input_index), DataType::Handle()));
+      main_signature_.push_back(tir::Var("input", DataType::Handle()));
     }
 
     // Define the storage allocator ids
@@ -592,7 +616,7 @@ class AOTExecutorCodegen : public ExprVisitor {
     // Find the return sid
     return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func);
     for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) {
-      main_signature_.push_back(tir::Var(MakeString("output_", output_index), DataType::Handle()));
+      main_signature_.push_back(tir::Var("output", DataType::Handle()));
     }
 
     VisitExpr(func->body);
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 661df9305036..992df61980f8 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -192,17 +192,59 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
        << "}\n";
   }
 
+  void GenerateEntrypointForUnpackedAPI() {
+    code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix << "(";
+    int total_args = (metadata_->num_inputs + metadata_->num_outputs);
+    for (int i = 0; i < total_args; ++i) {
+      code_ << "void* arg" << i;
+      if (i + 1 != total_args) {
+        code_ << ",";
+      }
+    }
+    code_ << ");\n";
+    code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main;
+    code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
+             "out_type_code, void* resource_handle) {\n";
+    code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix << "(";
+    for (int i = 0; i < metadata_->num_inputs; ++i) {
+      code_ << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,";
+    }
+    for (int i = 0; i < metadata_->num_outputs; ++i) {
+      int j = metadata_->num_inputs + i;
+      code_ << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data";
+      if (i + 1 != metadata_->num_outputs) {
+        code_ << ",";
+      }
+    }
+    code_ << ");\n";
+    code_ << "}\n";
+  }
+
+  void GenerateEntrypointForPackedAPI() {
+    code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix;
+    code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
+             "out_type_code, void* resource_handle);\n";
+    code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main;
+    code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
+             "out_type_code, void* resource_handle) {\n";
+    code_ << "return " << ::tvm::runtime::symbol::tvm_run_func_prefix;
+    code_ << "(args, type_code, num_args, out_value, out_type_code, resource_handle);\n";
+    code_ << "}\n";
+  }
+
   void GenerateAOTDescriptor() {
     code_ << "#include \"tvm/runtime/crt/internal/aot_executor/aot_executor.h\"\n";
     code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n";
     code_ << "#ifdef __cplusplus\n";
     code_ << "extern \"C\"\n";
     code_ << "#endif\n";
-    code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix;
-    code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
-             "out_type_code, void* resource_handle);\n";
+    if (target_->GetAttr<Bool>("unpacked-api").value_or(Bool(false))) {
+      GenerateEntrypointForUnpackedAPI();
+    } else {
+      GenerateEntrypointForPackedAPI();
+    }
     code_ << "const tvm_model_t network = {\n"
-          << "    .run_func = &" << ::tvm::runtime::symbol::tvm_run_func_prefix << ",\n"
+          << "    .run_func = &" << ::tvm::runtime::symbol::tvm_module_main << ",\n"
           << "    .num_input_tensors = " << metadata_->num_inputs << ",\n"
           << "    .num_output_tensors = " << metadata_->num_outputs << ", \n"
           << "};\n";
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 08e998e0f035..b2a813a874e6 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -258,6 +258,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<Bool>("system-lib")
     .add_attr_option<String>("runtime")
     .add_attr_option<Bool>("link-params", Bool(false))
+    .add_attr_option<Bool>("unpacked-api")
     .set_default_keys({"cpu"});
 
 TVM_REGISTER_TARGET_KIND("c", kDLCPU)
@@ -268,6 +269,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU)
     .add_attr_option<String>("march")
     .add_attr_option<String>("executor")
     .add_attr_option<Integer>("workspace-byte-alignment")
+    .add_attr_option<Bool>("unpacked-api")
     .set_default_keys({"cpu"});
 
 TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc
new file mode 100644
index 000000000000..154d0bfa5787
--- /dev/null
+++ b/src/tir/transforms/make_unpacked_api.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file make_unpacked_api.cc Lower PrimFunc to a standard C function API.
+ */
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/target/target.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/buffer.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arg_binder.h"
+#include "ir_utils.h"
+
+namespace tvm {
+namespace tir {
+
+PrimFunc MakeUnpackedAPI(PrimFunc&& func) {
+  auto global_symbol = func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+  ICHECK(global_symbol) << "MakeUnpackedAPI: Expect PrimFunc to have the global_symbol attribute";
+
+  auto target = func->GetAttr<Target>(tvm::attr::kTarget);
+  ICHECK(target.defined()) << "MakeUnpackedAPI: Require the target attribute";
+
+  auto* func_ptr = func.CopyOnWrite();
+
+  // Setup device context
+  int target_device_type = target.value()->kind->device_type;
+  Integer device_type(target_device_type);
+  Integer device_id(0);
+  PrimExpr node = StringImm("default");
+  const Stmt nop = Evaluate(0);
+  std::vector<Stmt> device_init;
+
+  // Create arg to buffer binder
+  std::unordered_map<const VarNode*, PrimExpr> vmap;
+  ArgBinder binder(&vmap);
+
+  // Collect variables and buffers to map between
+  Array<Var> args;
+  std::vector<std::pair<Var, Var>> var_def;
+  std::vector<std::pair<Var, Buffer>> buffer_def;
+
+  for (int i = 0; i < static_cast<int>(func_ptr->params.size()); ++i) {
+    Var param = func_ptr->params[i];
+    Var v_arg = Var("arg" + std::to_string(i), param->dtype);
+
+    auto it = func_ptr->buffer_map.find(param);
+    if (it != func_ptr->buffer_map.end()) {
+      buffer_def.emplace_back(v_arg, (*it).second);
+    } else {
+      var_def.emplace_back(v_arg, param);
+    }
+
+    args.push_back(v_arg);
+  }
+
+  // Bind variables then bind buffers to them to ensure correct ordering
+  for (const auto& kv : var_def) {
+    binder.Bind(kv.second, kv.first, kv.first->name_hint, true);
+  }
+  for (const auto& kv : buffer_def) {
+    binder.Bind(kv.second->data, kv.first, kv.first->name_hint, true);
+  }
+
+  if (buffer_def.size()) {
+    device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop));
+    device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop));
+  }
+
+  func_ptr->body = MergeNest({device_init, binder.init_nest(), binder.asserts()}, func_ptr->body);
+  func_ptr->params = args;
+  func_ptr->ret_type = PrimType(DataType::Int(32));
+
+  // return the function.
+  return std::move(func);
+}
+
+namespace transform {
+
+Pass MakeUnpackedAPI() {
+  auto pass_func = [](IRModule m, PassContext ctx) {
+    IRModuleNode* mptr = m.CopyOnWrite();
+    std::vector<std::pair<GlobalVar, PrimFunc>> updates;
+
+    for (const auto& kv : mptr->functions) {
+      if (auto* n = kv.second.as<PrimFuncNode>()) {
+        PrimFunc func = GetRef<PrimFunc>(n);
+        if (func->GetAttr<Integer>(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) ==
+            CallingConv::kDefault) {
+          auto updated_func = MakeUnpackedAPI(std::move(func));
+          updates.push_back({kv.first, updated_func});
+        }
+      }
+    }
+
+    for (const auto& pair : updates) {
+      mptr->AddUnchecked(pair.first, pair.second);
+    }
+    return m;
+  };
+
+  return tvm::transform::CreateModulePass(pass_func, 0, "tir.MakeUnpackedAPI", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.MakeUnpackedAPI").set_body_typed(MakeUnpackedAPI);
+}  // namespace transform
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
index c1917674873d..a54ffb80f051 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -165,12 +165,18 @@ def extract_main_workspace_sizebytes(extract_dir):
 
 
 def compile_and_run(
-    mod, input_list, output_list, use_calculated_workspaces, params=None, workspace_byte_alignment=8
+    mod,
+    input_list,
+    output_list,
+    target_options,
+    use_calculated_workspaces,
+    params=None,
+    workspace_byte_alignment=8,
 ):
     """
     This method verifies the generated source
     """
-    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}"
+    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment} {target_options}"
 
     cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
 
     # The calculated workspaces will not account for stack allocator tags used for debugging
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 02b4de3a64f3..4f8de450d9f1 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -44,7 +44,8 @@
 
 
 @pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-def test_conv_with_params(use_calculated_workspaces):
+@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
+def test_conv_with_params(use_calculated_workspaces, target_options):
     RELAY_MODEL = """
 #[version = "0.0.5"]
 def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
@@ -73,11 +74,12 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
     output_list = generate_ref_data(mod, inputs, params)
     input_list = [input_data]
 
-    compile_and_run(mod, input_list, output_list, use_calculated_workspaces, params)
+    compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces, params)
 
 
 @pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-def test_add_with_params(use_calculated_workspaces):
+@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
+def test_add_with_params(use_calculated_workspaces, target_options):
     x = relay.var("x", shape=(1, 10))
     y = relay.var("y", shape=(1, 10))
     z = relay.add(x, y)
@@ -91,11 +93,14 @@ def test_add_with_params(use_calculated_workspaces):
     output_list = generate_ref_data(func, inputs, params)
     input_list = [y_in]
 
-    compile_and_run(func, input_list, output_list, use_calculated_workspaces, params)
+    compile_and_run(
+        func, input_list, output_list, 
target_options, use_calculated_workspaces, params + ) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_conv2d(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_conv2d(use_calculated_workspaces, target_options): """Test a subgraph with a single conv2d operator.""" def conv2d_direct(): @@ -137,11 +142,12 @@ def group_conv2d(): for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: output_list = generate_ref_data(mod, inputs) input_list = [inputs["data"], inputs["weight"]] - compile_and_run(mod, input_list, output_list, use_calculated_workspaces) + compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_concatenate(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_concatenate(use_calculated_workspaces, target_options): dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) @@ -157,11 +163,12 @@ def test_concatenate(use_calculated_workspaces): output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"], inputs["z"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_nested_tuples(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_nested_tuples(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10,)) x1 = x + relay.const(1.0) x2 = x1 + relay.const(1.0) @@ -174,39 +181,43 @@ def test_nested_tuples(use_calculated_workspaces): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [x_data] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_tuple_getitem(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_tuple_getitem(use_calculated_workspaces, target_options): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_id(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_id(use_calculated_workspaces, target_options): x = relay.var("x", "float32") ident = relay.Function([x], x) one = np.array(1.0, "float32") inputs = {"x": one} output_list = generate_ref_data(ident, inputs) input_list = [one] - compile_and_run(ident, input_list, output_list, use_calculated_workspaces) + compile_and_run(ident, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_add_const(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", 
"--unpacked-api=1"]) +def test_add_const(use_calculated_workspaces, target_options): two = relay.add(relay.const(1), relay.const(1)) func = relay.Function([], two) output_list = generate_ref_data(func, {}) input_list = [] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_mul_param(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_mul_param(use_calculated_workspaces, target_options): x = relay.var("x", shape=(10, 10)) y = relay.var("y", shape=(1, 10)) func = relay.Function([x, y], relay.multiply(x, y)) @@ -215,11 +226,12 @@ def test_mul_param(use_calculated_workspaces): inputs = {"x": x_data, "y": y_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_subtract(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_subtract(use_calculated_workspaces, target_options): i = relay.var("i", shape=[], dtype="int32") sub = relay.subtract(i, relay.const(1, dtype="int32")) func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32")) @@ -227,11 +239,12 @@ def test_subtract(use_calculated_workspaces): inputs = {"i": i_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["i"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_tuple_output(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_tuple_output(use_calculated_workspaces, target_options): x = relay.var("x", shape=(6, 9)) y = relay.split(x, 3).astuple() a = relay.TupleGetItem(y, 0) @@ -243,15 +256,17 @@ def test_tuple_output(use_calculated_workspaces): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"]] - compile_and_run(func, input_list, output_list, use_calculated_workspaces) + compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces) @pytest.mark.parametrize( "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)] ) -def test_mobilenet(use_calculated_workspaces_and_alignment): +@pytest.mark.parametrize("target_options", ["--unpacked-api"]) +def test_mobilenet(use_calculated_workspaces_and_alignment, target_options): use_calculated_workspaces = use_calculated_workspaces_and_alignment[0] workspace_byte_alignment = use_calculated_workspaces_and_alignment[1] + mod, params = testing.mobilenet.get_workload(batch_size=1) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype("float32") @@ -259,7 +274,13 @@ def test_mobilenet(use_calculated_workspaces_and_alignment): output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["data"]] compile_and_run( - mod, input_list, output_list, use_calculated_workspaces, params, workspace_byte_alignment + mod, + input_list, + 
output_list, + target_options, + use_calculated_workspaces, + params, + workspace_byte_alignment, ) @@ -318,7 +339,8 @@ def visit_call(self, call): @pytest.mark.parametrize("use_calculated_workspaces", [True, False]) -def test_byoc_utvm(use_calculated_workspaces): +@pytest.mark.parametrize("target_options", [""]) +def test_byoc_utvm(use_calculated_workspaces, target_options): """This is a simple test case to check BYOC capabilities of AOT""" x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -361,7 +383,7 @@ def test_byoc_utvm(use_calculated_workspaces): output_list = generate_ref_data(mod, map_inputs) input_list = [map_inputs["x"]] input_list.extend([map_inputs["w{}".format(i)] for i in range(8)]) - compile_and_run(mod, input_list, output_list, use_calculated_workspaces) + compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_transform_make_unpacked_api.py b/tests/python/unittest/test_tir_transform_make_unpacked_api.py new file mode 100644 index 000000000000..9d917466758b --- /dev/null +++ b/tests/python/unittest/test_tir_transform_make_unpacked_api.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest
+
+import tvm
+import tvm.testing
+
+
+@pytest.fixture
+def mod_without_attrs():
+    ib = tvm.tir.ir_builder.create()
+    A = tvm.tir.decl_buffer(name="A", shape=[1])
+    stmt = ib.get()
+    return tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], stmt))
+
+
+@pytest.fixture
+def mod(mod_without_attrs):
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(
+        mod_without_attrs
+    )
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod)
+
+    return mod
+
+
+def test_fails_if_not_global_symbol(mod_without_attrs):
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(
+        mod_without_attrs
+    )
+    with pytest.raises(tvm.TVMError, match="Expect PrimFunc to have the global_symbol attribute"):
+        f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+
+
+def test_fails_if_no_target(mod_without_attrs):
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod_without_attrs)
+    with pytest.raises(tvm.TVMError, match="Require the target attribute"):
+        f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+
+
+@tvm.testing.parametrize_targets("c", "llvm", "cuda")
+def test_device_setup(mod, target, dev):
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target(target)))(mod)
+    f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+    assert len(f.params) == 1
+    assert f.params[0].name == "arg0"
+    assert f.body.node == "default"
+    assert f.body.attr_key == "device_id"
+    assert f.body.value == 0
+    assert f.body.body.node == "default"
+    assert f.body.body.attr_key == "device_type"
+    assert f.body.body.value == dev.device_type
+
+
+def test_no_buffers_no_device_setup():
+    ib = tvm.tir.ir_builder.create()
+    A = ib.pointer("float32", name="A")
+    stmt = ib.get()
+    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], stmt))
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod)
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod)
+
+    f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+    assert len(f.params) == 1
+    assert f.body.var.name == "A"
+    assert f.body.value.name == "arg0"
+
+
+def test_argument_mapping(mod):
+    f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+    assert len(f.params) == 1
+    assert f.params[0].name == "arg0"
+    assert f.body.body.body.var.name == "A"
+    assert f.body.body.body.value.name == "arg0"
+
+
+def test_argument_mapping_multiple():
+    ib = tvm.tir.ir_builder.create()
+    A = tvm.tir.decl_buffer(name="A", shape=[1])
+    B = tvm.tir.decl_buffer(name="B", shape=[1])
+
+    stmt = ib.get()
+    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B], stmt))
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod)
+    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod)
+
+    f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
+    assert len(f.params) == 2
+    assert f.params[0].name == "arg0"
+    assert f.params[1].name == "arg1"
+    assert f.body.body.body.var.name == "A"
+    assert f.body.body.body.value.name == "arg0"
+    assert f.body.body.body.body.var.name == "B"
+    assert f.body.body.body.body.value.name == "arg1"
+
+
+def test_argument_mapping_multiple_matching():
+    ib = tvm.tir.ir_builder.create()
+    A = tvm.tir.decl_buffer(name="A", shape=[1])
+    B = tvm.tir.decl_buffer(name="B", shape=[1])
+    stmt = ib.get()
+    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, A], stmt))
+    mod = 
tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 2 + assert f.params[0].name == "arg0" + assert f.params[1].name == "arg1" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg0" + assert f.body.body.body.body.condition.a.name == "A" + assert f.body.body.body.body.condition.b.name == "arg1" + + +def test_body(): + ib = tvm.tir.ir_builder.create() + A = tvm.tir.decl_buffer(name="A", shape=[1]) + B = tvm.tir.decl_buffer(name="B", shape=[1]) + C = ib.buffer_ptr(A) + + stmt = ib.get() + mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, B, C], stmt)) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod) + mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) + f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"] + assert len(f.params) == 3 + assert f.params[0].name == "arg0" + assert f.params[1].name == "arg1" + assert f.params[2].name == "arg2" + assert f.body.body.body.var.name == "A" + assert f.body.body.body.value.name == "arg2" + assert f.body.body.body.body.var.name == "B" + assert f.body.body.body.body.value.name == "arg1" + assert f.body.body.body.body.body.condition.a.name == "A" + assert f.body.body.body.body.body.condition.b.name == "arg0" + + +if __name__ == "__main__": + pytest.main([__file__])
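Reviewer note: the unit tests above double as usage documentation. Below is a minimal sketch of driving the new pass by hand, mirroring the module setup in test_tir_transform_make_unpacked_api.py; the buffer name, "llvm" target, and "main" symbol are illustrative, and it assumes this patch is applied.

import tvm

# Build a trivial PrimFunc with one buffer argument, as the fixtures above do.
ib = tvm.tir.ir_builder.create()
A = tvm.tir.decl_buffer(name="A", shape=[1])
mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A], ib.get()))

# MakeUnpackedAPI requires the "target" and "global_symbol" attributes to be set.
mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", tvm.target.Target("llvm")))(mod)
mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod)

# The pass rewrites "main" to take one bare pointer per buffer ("arg0", "arg1", ...)
# in place of the packed (args, type_code, num_args, ...) signature.
f = tvm.tir.transform.MakeUnpackedAPI()(mod)["main"]
print([p.name for p in f.params])  # -> ["arg0"]

In the AOT flow the same behaviour is selected through the new target attribute, e.g. "c -runtime=c --link-params --executor=aot --unpacked-api=1", which swaps MakePackedAPI for MakeUnpackedAPI during lowering and makes the generated entrypoint unwrap each DLTensor to its data pointer before calling the operator functions.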