From 9308a3f82111404e4b90b439df4adca8b46ff9d1 Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne
Date: Thu, 4 Nov 2021 19:30:43 +0000
Subject: [PATCH 01/15] USMP integration with AoT executor codegen

This commit integrates USMP with the AoT executor codegen.
Additionally, this commit introduces two PassContext options,
tir.usmp.disable and tir.disable_storage_rewrite, to disable
USMP and the StorageRewrite pass respectively.

Change-Id: I98d905fab7f49c2de52e126115953a40b0821e21
---
 include/tvm/tir/transform.h                   |   7 +
 include/tvm/tir/usmp/algo/algo.h              |  62 +++++
 include/tvm/tir/usmp/analysis.h               |  49 ++++
 include/tvm/tir/usmp/transform.h              |  66 +++++
 include/tvm/tir/usmp/utils.h                  |  37 ++-
 src/driver/driver_api.cc                      |   7 +-
 src/relay/backend/aot_executor_codegen.cc     | 216 +++++++++++------
 src/runtime/meta_data.h                       |  18 +-
 src/target/source/codegen_source_base.cc      |  14 +-
 src/target/source/codegen_source_base.h       |   6 +
 src/target/source/source_module.cc            | 227 ++++++++++++++----
 src/tir/transforms/make_unpacked_api.cc       |  31 +--
 src/tir/usmp/analysis/extract_buffer_info.cc  |  19 +-
 src/tir/usmp/transform/assign_pool_info.cc    | 120 +++++++++
 .../convert_pool_allocations_to_offsets.cc    |  47 +++-
 src/tir/usmp/unified_static_memory_planner.cc |  96 ++++++++
 src/tir/usmp/utils.cc                         |  82 ++++++-
 tests/python/contrib/test_ethosu/infra.py     |   4 +-
 tests/python/relay/aot/aot_test_utils.py      |   6 +-
 tests/python/relay/aot/corstone300.ld         |   7 +
 tests/python/relay/aot/test_crt_aot.py        |  35 ++-
 21 files changed, 971 insertions(+), 185 deletions(-)
 create mode 100644 include/tvm/tir/usmp/algo/algo.h
 create mode 100644 include/tvm/tir/usmp/analysis.h
 create mode 100644 include/tvm/tir/usmp/transform.h
 create mode 100644 src/tir/usmp/transform/assign_pool_info.cc
 create mode 100644 src/tir/usmp/unified_static_memory_planner.cc

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index 7a6cfa364447..97d750ce6aad 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -484,6 +484,13 @@ TVM_DLL Pass MergeDynamicSharedMemoryAllocations();
  */
 TVM_DLL Pass ConvertForLoopsToSerial();
 
+/*!
+ * \brief The unified static memory planner pass that plans
+ * memory both within (intra-) and across (inter-) PrimFuncs.
+ * \return The pass.
+ */
+TVM_DLL Pass UnifiedStaticMemoryPlanner();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/include/tvm/tir/usmp/algo/algo.h b/include/tvm/tir/usmp/algo/algo.h
new file mode 100644
index 000000000000..8a2e3475ce19
--- /dev/null
+++ b/include/tvm/tir/usmp/algo/algo.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir/usmp/algo/algo.h + * \brief The memory planning algorithm for USMP + */ + +#ifndef TVM_TIR_USMP_ALGO_ALGO_H_ +#define TVM_TIR_USMP_ALGO_ALGO_H_ + +#include + +namespace tvm { +namespace tir { +namespace usmp { +namespace algo { + +/*! + * \brief The Greedy-by-Size algorithm to plan memory + * + * This will perform a greedy algorithm in deciding the offsets + * within provided Pools, using the size of the buffer. + * + * \return A Map of BufferInfo objects and their associated PoolAllocation + */ +Map GreedyBySize(const Array& buffer_info_arr, + const Integer& memory_pressure); + +/*! + * \brief The Greedy-by-Conflicts algorithm to plan memory + * + * This will perform a greedy algorithm in deciding the offsets + * within provided Pools, using the number of liveness conflicts of the buffer. + * + * \return A Map of BufferInfo objects and their associated PoolAllocation + */ +Map GreedyByConflicts(const Array& buffer_info_arr, + const Integer& memory_pressure); + +} // namespace algo +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_ALGO_ALGO_H_ diff --git a/include/tvm/tir/usmp/analysis.h b/include/tvm/tir/usmp/analysis.h new file mode 100644 index 000000000000..a24851d33182 --- /dev/null +++ b/include/tvm/tir/usmp/analysis.h @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/usmp/analysis.h + * \brief The analysis passes for TIR-based Unified Static Memory Planner + */ + +#ifndef TVM_TIR_USMP_ANALYSIS_H_ +#define TVM_TIR_USMP_ANALYSIS_H_ + +#include +#include + +namespace tvm { +namespace tir { +namespace usmp { + +/*! + * \brief Extract BufferInfo objects from a TIR IRModule + * + * This pass would extract the buffer information of allocate nodes + * including liveness conflict with other buffer info objects. + * + * \return A Map of BufferInfo objects and their associated Stmts + */ +BufferInfoAnalysis ExtractBufferInfo(const PrimFunc& main_func, const IRModule& mod); + +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_ANALYSIS_H_ diff --git a/include/tvm/tir/usmp/transform.h b/include/tvm/tir/usmp/transform.h new file mode 100644 index 000000000000..7bc08ea3933c --- /dev/null +++ b/include/tvm/tir/usmp/transform.h @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/usmp/transform.h + * \brief The transform passes for TIR-based Unified Static Memory Planner + */ + +#ifndef TVM_TIR_USMP_TRANSFORM_H_ +#define TVM_TIR_USMP_TRANSFORM_H_ + +#include + +namespace tvm { +namespace tir { +namespace usmp { +namespace transform { + +using Pass = tvm::transform::Pass; + +/*! + * \brief Convert the analyzed PoolAllocation to offsets from pool variables + * + * This pass would convert the IRModule that contains all PrimFuncs that contains + * the associated PoolAllocations to be read from being offset from the input var + * of the PrimFunc. + * + * \return the pass + */ +TVM_DLL Pass ConvertPoolAllocationsToOffsets(const Map& pool_allocations, + Bool emit_tvmscript_printable = Bool(false)); + +/*! + * \brief Assign PoolInfo objects to tir.allocate nodes depending on the PrimFunc's target + * + * This pass would assign PoolInfo objects to tir.allocate nodes depending on the each target + * that each PrimFunc would belong to. If there are not any pools provided in the IRModule, + * this pass would create a global workspace pool that every target could access for as the + * default behaviour. + * + * \return the pass + */ +TVM_DLL Pass AssignPoolInfo(); + +} // namespace transform +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_TRANSFORM_H_ diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index 30c8f2ddea49..cf02596cb46c 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -26,6 +26,7 @@ #define TVM_TIR_USMP_UTILS_H_ #include +#include #include #include @@ -59,22 +60,26 @@ struct PoolInfoNode : public Object { Integer size_hint_bytes; /*! \brief The accessibility from each Target*/ Map target_access; // 'rw' or 'ro' + /*! 
\brief Whether pool is internally generated*/ + Bool is_internal = Bool(false); void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pool_name", &pool_name); v->Visit("size_hint_bytes", &size_hint_bytes); v->Visit("target_access", &target_access); + v->Visit("is_internal", &is_internal); } bool SEqualReduce(const PoolInfoNode* other, SEqualReducer equal) const { return equal(pool_name, other->pool_name) && equal(size_hint_bytes, other->size_hint_bytes) && - equal(target_access, other->target_access); + equal(target_access, other->target_access) && equal(is_internal, other->is_internal); } void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(pool_name); hash_reduce(size_hint_bytes); hash_reduce(target_access); + hash_reduce(is_internal); } static constexpr const char* _type_key = "tir.usmp.PoolInfo"; @@ -89,7 +94,8 @@ static const int kUnrestrictedPoolSizeHint = -1; class PoolInfo : public ObjectRef { public: TVM_DLL PoolInfo(String pool_name, Map target_access, - Integer size_hint_bytes = kUnrestrictedPoolSizeHint); + Integer size_hint_bytes = kUnrestrictedPoolSizeHint, + Bool is_internal = Bool(false)); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PoolInfo, ObjectRef, PoolInfoNode); }; @@ -268,7 +274,16 @@ class AllocatedPoolInfo : public ObjectRef { * * \param buffer_info_map IR-bound BufferInfo map */ -Array CreateArrayBufferInfo(const Map& buffer_info_map); +Array CreateArrayBufferInfo(const Map& buffer_info_map); + +/*! + * \brief Calculate workspace required to execute a IRModule with main expressed in TIR + * + * \param mod the IRModule with TIR-based main function + */ +Integer CalculateModuleWorkspaceSize(const IRModule& mod); + +void PrintConflicts(const Array& buffer_info_arr); /*! * \brief The allocate node attribute to indicate candidate memory pools. @@ -284,6 +299,16 @@ static constexpr const char* kPoolCandidatesAllocateAttr = "candidate_memory_poo */ Integer CalculateExtentsSize(const AllocateNode* op); +/*! + * \brief Joins the Stmt nodes with PoolAllocation objects + * + * \param buffer_info_to_stmt the map of BufferInfo objects to Stmt nodes + * \param buffer_info_to_pool_allocation the map of BufferInfo objects to PoolAllocation objects + */ +Map AssignStmtPoolAllocations( + const Map& buffer_info_to_stmt, + const Map& buffer_info_to_pool_allocation); + } // namespace usmp } // namespace tir @@ -294,6 +319,12 @@ namespace attr { */ static constexpr const char* kPoolArgs = "pool_args"; +/*! + * \brief This is a BaseFunc attribute to indicate which input var represent + * a PoolInfo Object in the form of a Map. 
+ */ +static constexpr const char* kPoolInfoIRModuleAttr = "pool_infos"; + } // namespace attr } // namespace tvm diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7dc7b28b968b..e750344f4f0c 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -44,6 +44,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.detect_global_barrier", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_bound_checkers", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_assert", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_vectorize", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); @@ -191,6 +192,8 @@ Array CreatePassList(bool disable_loop_partition) { transform::PassContext pass_ctx = transform::PassContext::Current(); bool disable_vectorize = pass_ctx->GetConfig("tir.disable_vectorize", Bool(false)).value(); + bool disable_storage_rewrite = + pass_ctx->GetConfig("tir.disable_storage_rewrite", Bool(false)).value(); bool instrument_bound_checkers = pass_ctx->GetConfig("tir.instrument_bound_checkers", Bool(false)).value(); @@ -260,7 +263,9 @@ Array CreatePassList(bool disable_loop_partition) { pass_list.push_back(tir::transform::VectorizeLoop(!disable_vectorize)); pass_list.push_back(tir::transform::InjectVirtualThread()); pass_list.push_back(tir::transform::InjectDoubleBuffer()); - pass_list.push_back(tir::transform::StorageRewrite()); + if (!disable_storage_rewrite) { + pass_list.push_back(tir::transform::StorageRewrite()); + } pass_list.push_back(tir::transform::UnrollLoop()); // Add user-defined phase-2 passes diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index d901f8a26c4f..38b1cee6b9a2 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -290,7 +291,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - buffer_vars.push_back(main_signature_[input_vars_.size() + output_index]); + buffer_vars.push_back(GetBufferVarForIO(input_vars_.size() + output_index)); continue; } @@ -308,7 +309,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (input_iter != input_vars_.end()) { // Input variable int main_index = std::distance(input_vars_.begin(), input_iter); - return {main_signature_[main_index]}; + return {GetBufferVarForIO(main_index)}; } else { // Storage identifier (i.e., intermediate memory) return PackSid(arg); @@ -331,7 +332,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (params_by_expr_.find(arg) != params_by_expr_.end()) { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[arg])}); - args.push_back(param_handle); + args.push_back(tvm::tir::Cast(DataType::Handle(), param_handle)); } else { auto var_arg = FindExpr(arg); for (const auto& var : var_arg) { @@ -405,26 +406,11 @@ class AOTExecutorCodegen : public MixedModeVisitor { auto tmp1 = te::Var("tmp1", DataType::Handle()); te::Var loop_idx("i", DataType::Int(32)); auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true()); - - 
PrimExpr retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {in, 0, tir::builtin::kArrData}); - PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {out, 0, tir::builtin::kArrData}); - if (use_unpacked_api_) { - tostore = out; - } - - // Do not pack the input if the flag is set or the caller - // explicitly asked to do so (e.g., copying a param to the output) - if (use_unpacked_api_ || !pack_input) { - retval_get = in; - } - // Copy the variable from the input to the output - tir::Stmt copy = tir::For( - loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, - tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true())); - stmts_.push_back(tir::LetStmt(tmp1, tostore, copy)); + tir::Stmt copy = + tir::For(loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, + tir::Store(tmp1, tir::Let(tmp0, in, retval_i), loop_idx, tir::const_true())); + stmts_.push_back(tir::LetStmt(tmp1, out, copy)); } /* @@ -546,12 +532,12 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (params_by_expr_.find(expr) != params_by_expr_.end()) { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, - /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), param_handle, + /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]); } else { auto var_expr = FindExpr(expr); - CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], - /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), var_expr[0], + /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]); } } } @@ -572,7 +558,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { int output_index = std::distance(return_sid_.begin(), output_iter); auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, false, + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), param_handle, false, sinfo->storage_sizes_in_bytes[0]); } } @@ -645,32 +631,120 @@ class AOTExecutorCodegen : public MixedModeVisitor { // TODO(giuseros): we should allocate this once outside the PrimFunc // so we don't pay the price of allocation for every inference if (!allocated[sid]) { - body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body); + PointerType ptype = Downcast(sids_table_[sid]->type_annotation); + DataType element_type = Downcast(ptype->element_type)->dtype; + body = tir::Allocate(sids_table_[sid], element_type, {size}, tir::const_true(), body); } allocated[sid] = true; } } - // Define the attributes - body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body); - body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body); - // Define the PrimFunc attributes Map dict_attrs; String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_run_func_suffix); dict_attrs.Set("global_symbol", run_func_name); dict_attrs.Set("runner_function", Bool(true)); + dict_attrs.Set(tvm::attr::kTarget, target_host_); tir::Stmt device_activations = GenerateAllDeviceHook("Activate"); tir::Stmt device_deactivations = GenerateAllDeviceHook("Deactivate"); 
tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations}); // Make the PrimFunc - return tir::PrimFunc(main_signature_, final_body, VoidType(), Map(), + return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, DictAttrs(dict_attrs)); } + /*! + * brief Access IO vars using the buffer vars and + * not the actual var. + */ + tir::Var GetBufferVarForIO(int index) { return main_buffer_map_[main_signature_[index]]->data; } + + /*! + * brief Create tir::Var for input/output while updating + * the buffer_maps. + */ + void CreateIOVar(const Expr& expr, std::string name) { + if (expr->IsInstance()) { + Tuple tuple = Downcast(expr); + for (unsigned i = 0; i < tuple->fields.size(); i++) { + CreateIOVar(tuple->fields[i], name + std::to_string(i) + "_"); + } + } else { + tir::Var var = tir::Var(name, DataType::Handle()); + main_signature_.push_back(var); + auto tensor_type = expr->checked_type().as(); + DataType elem_type = tensor_type->dtype; + tir::Var buffer_var = + tir::Var(name + "_buffer_var", PointerType(PrimType(elem_type), "global")); + tir::Buffer buffer = tir::Buffer(buffer_var, elem_type, tensor_type->shape, {}, 0, + name + "_buffer", 16, 1, tir::BufferType::kDefault); + main_buffer_map_.Set(var, buffer); + } + } + + /*! + * brief This function is a wrapper to run memory planning + * followed by recording the latest workspaces required. + */ + IRModule PlanMemoryLoweredModule(const IRModule& mod) { + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool disable_usmp = pass_ctx->GetConfig("tir.usmp.disable", Bool(false)).value(); + + IRModule lowered_mod = mod->ShallowCopy(); + Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr("workspace-byte-alignment").value_or(16); + if (!disable_usmp) { + lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); + // Update workspace size based on the pool allocations. 
+ Optional> allocated_pool_infos = + lowered_mod->GetAttr>(tvm::attr::kPoolArgs); + int main_workspace_size = 0; + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : + allocated_pool_infos.value()) { + main_workspace_size += allocated_pool_info->allocated_size->value; + } + } + for (const auto& kv : function_metadata_) { + if (lowered_mod->ContainGlobalVar(kv.first) && + lowered_mod->Lookup(kv.first)->IsInstance()) { + tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); + Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + kv.second->workspace_sizes.Set(tgt, ws); + } + } + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + } else { + // Running StorageRewrite just on the main function + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + IRModule main_func_mod; + main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + tir_main_func); + main_func_mod = tir::transform::StorageRewrite()(main_func_mod); + lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + // Use the PrimFunc to calculate the workspace required to service the allocates + Integer main_workspace_size = + CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + } + return lowered_mod; + } + protected: /*! \brief mod */ runtime::Module* mod_; @@ -682,6 +756,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { Map device_contexts_; /*! \brief input and output variables belonging to the main function signature */ Array main_signature_; + /*! \brief input and output variables belonging to the main function signature */ + Map main_buffer_map_; /*! \brief target device */ tec::TargetMap targets_; /*! \brief target host */ @@ -773,7 +849,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - main_signature_.push_back(tir::Var("input", DataType::Handle())); + std::string input_name = input->name_hint(); + CreateIOVar(input, input_name); } // Define the storage allocator ids @@ -792,9 +869,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Retrieve the return sids return_sid_ = final_aot_allocator.GetReturnIds(); - for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { - main_signature_.push_back(tir::Var("output", DataType::Handle())); - } + // Insert outputs to main func signature + CreateIOVar(lowered_main_func->body, "output"); CollectDeviceVariables(lowered_mod->GetAttr>("device_contexts").value()); VisitExpr(lowered_main_func->body); @@ -802,7 +878,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Create the runner function. 
Please note that the function is not legal yet // because the packed calls arguments are not wrapped in TVMValues. To make this happen we need // to run the LegalizePackedCalls pass. - auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size()); LoweredOutput ret; ret.params = std::unordered_map>(); @@ -812,36 +887,23 @@ class AOTExecutorCodegen : public MixedModeVisitor { std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); } - // Build the TIR IRModule for the main AOT function - Map symbol_map; - symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); - IRModule mod_run(symbol_map, {}, {}, {}, mod->attrs); - VLOG(1) << "main module:" << std::endl << PrettyPrint(mod_run); - - // Apply storage rewrite pass to the runner function to do memory planning - auto storage_rewrite = tir::transform::StorageRewrite(); - mod_run = storage_rewrite(mod_run); - // The workspace for main function should be calculated after performing storage_rewrite for - // the top level TIR function. - Integer main_workspace_size = CalculateWorkspaceBytes( - Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), - workspace_byte_alignment); - - Optional main_func_info = - lowered_mod->GetAttr("main_func_info"); - - main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + // AoT Executor codegen works completely on TIR beyond this point, hence removing relay main + // function and replacing it with its TIR version. We should try to make this a Pass. + lowered_mod->Remove(lowered_mod->GetGlobalVar("main")); + auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size()); + lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); + // Parallel for loops are not supported in AoT codegen. + lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod); + lowered_mod = PlanMemoryLoweredModule(lowered_mod); + ret.function_metadata = std::move(function_metadata_); // Legalize AOT if needed. 
This means that all the packed calls // need to be wrapped in TVMValues (unless use_unpacked_api is set) if (!use_unpacked_api_) { auto pack_calls = tir::transform::LegalizePackedCalls(); - mod_run = pack_calls(mod_run); + lowered_mod = pack_calls(lowered_mod); } - ret.function_metadata = std::move(function_metadata_); - Optional> external_modules = lowered_mod->GetAttr>("external_mods"); ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; @@ -859,20 +921,28 @@ class AOTExecutorCodegen : public MixedModeVisitor { ret.external_mods = external_modules.value(); - if (ret.lowered_funcs.find(target_host_) != ret.lowered_funcs.end()) { - VLOG(1) << "merging main into existing module for host target"; - ret.lowered_funcs[target_host_]->Update(mod_run); - } else { - VLOG(1) << "adding main into new module for host target"; - ret.lowered_funcs.Set(target_host_, mod_run); + Map pool_var_info; + std::vector pool_vars; + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Optional> allocated_pool_infos = + tir_main_func->GetAttr>(tvm::attr::kPoolArgs); + int main_workspace_size = 0; + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + pool_vars.push_back(allocated_pool_info->pool_var.value()); + pool_var_info.Set(allocated_pool_info->pool_var.value(), allocated_pool_info); + main_workspace_size += allocated_pool_info->allocated_size->value; + } } - - std::vector input_var_names(input_vars_.size()); - std::transform(input_vars_.begin(), input_vars_.end(), input_var_names.begin(), - [](Var input_var) -> String { return input_var->name_hint(); }); + Array devices = ListDevices(); + Array inputs = + Array(tir_main_func->params.begin(), + tir_main_func->params.begin() + tir_main_func->params.size() - + return_sid_.size() - pool_vars.size() - devices.size()); ret.metadata = - runtime::Metadata(input_var_names, ListDevices(), return_sid_.size(), - runtime::kTvmExecutorAot, mod_name, interface_api, use_unpacked_api_); + runtime::Metadata(inputs, pool_vars, devices, return_sid_.size(), runtime::kTvmExecutorAot, + mod_name, interface_api, use_unpacked_api_, pool_var_info); return ret; } diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 8996d1b76e1f..80d3eeb09dd8 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -55,9 +56,11 @@ inline String get_name_mangled(const String& module_name, const String& name) { class MetadataNode : public Object { public: /*! \brief input information for the main function */ - Array inputs; + Array inputs; + /*! \brief pool information for the main function */ + Array pools; /*! \brief number of outputs of the main function */ - int num_outputs = 1; + unsigned int num_outputs = 1; /*! \brief device contexts information for the main function */ Array devices; /*! \brief the executor to be used to run the model */ @@ -66,6 +69,8 @@ class MetadataNode : public Object { String interface_api; /*! \brief The internal API (packed or unpacked) in use */ bool unpacked_api; + /*! 
\brief the input var names that correspond to pool_inputs */ + Optional> pool_inputs; String mod_name = ""; @@ -79,16 +84,21 @@ class MetadataNode : public Object { */ class Metadata : public ObjectRef { public: - TVM_DLL Metadata(Array inputs, Array devices, int num_outputs, String executor, - String mod_name, String interface_api = "packed", bool unpacked_api = false) { + TVM_DLL Metadata(Array inputs, Array pools, Array devices, + int num_outputs, String executor, String mod_name, + String interface_api = "packed", bool unpacked_api = false, + Map pool_inputs = + Map()) { auto n = make_object(); n->inputs = inputs; + n->pools = pools; n->devices = devices; n->num_outputs = num_outputs; n->executor = executor; n->interface_api = interface_api; n->unpacked_api = unpacked_api; n->mod_name = mod_name; + n->pool_inputs = pool_inputs; data_ = std::move(n); } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 9f0cf9a70b61..518da725d08f 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -74,13 +74,13 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { std::string key = v->name_hint; std::string vid = GetUniqueName(key); var_idmap_[v] = vid; - return vid; + return SanitiseName(vid); } std::string CodeGenSourceBase::GetVarID(const tir::VarNode* v) const { auto it = var_idmap_.find(v); ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; - return it->second; + return SanitiseName(it->second); } void CodeGenSourceBase::PrintIndent() { @@ -113,5 +113,15 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } +std::string CodeGenSourceBase::SanitiseName(std::string name) const { + std::replace_if( + name.begin(), name.end(), + [](char c) { + { return !std::isalnum(c); } + }, + '_'); + return name; +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index d938469b8969..0a3066db51a6 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,6 +53,12 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); + /*! + * \brief Sanitize names by removing illegal characters + * \param name The name to be sanitised. + */ + std::string SanitiseName(std::string name) const; + protected: /*! 
\brief entry in ssa assign map */ struct SSAEntry { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index e01a3d93d087..9ecbf173cbbc 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -34,6 +34,7 @@ #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" +#include "codegen_c.h" #include "codegen_source_base.h" namespace tvm { @@ -165,6 +166,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Target target_; relay::Runtime runtime_; runtime::Metadata metadata_; + CodeGenC codegen_c_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -197,45 +199,161 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << "}\n"; } + String GenerateDLTensorStructWrapper(String reference_arg) { + code_ << "DLTensor " << reference_arg << "_dlt = {\n"; + code_ << ".data = &" << reference_arg << "\n"; + code_ << "};\n"; + code_ << "TVMValue " << reference_arg << "_tvmv = {\n"; + code_ << ".v_handle = &" << reference_arg << "_dlt\n"; + code_ << "};\n"; + return reference_arg + "_tvmv"; + } + + void GenerateInternalWorkspaceBuffers() { + if (metadata_->pool_inputs.defined()) { + for (const auto& kv : metadata_->pool_inputs.value()) { + tir::usmp::AllocatedPoolInfo allocated_pool_info = kv.second; + if (allocated_pool_info->pool_info->is_internal) { + code_ << "__attribute__((section(\".bss.tvm\"), "; + code_ << "aligned(" << 16 << ")))\n"; + code_ << "static uint8_t " << allocated_pool_info->pool_info->pool_name << "[" + << allocated_pool_info->allocated_size->value << "];\n"; + } + } + } + } + + bool IsInternalWorkspaceBuffer(const tir::Var& pool_var) { + if (metadata_->pool_inputs.defined()) { + Map allocated_pool_infos = + metadata_->pool_inputs.value(); + if (allocated_pool_infos.find(pool_var) != allocated_pool_infos.end()) { + tir::usmp::AllocatedPoolInfo allocate_pool_info = allocated_pool_infos[pool_var]; + if (allocate_pool_info->pool_info->is_internal) { + return true; + } + } + } + return false; + } + void GenerateEntrypointForUnpackedAPI(const std::string& entrypoint_name, const std::string& run_func) { code_ << "TVM_DLL int32_t " << run_func << "("; - unsigned int total_args = (metadata_->inputs.size() + metadata_->num_outputs); - for (unsigned int i = 0; i < total_args; ++i) { - code_ << "void* arg" << i; - if (i + 1 != total_args) { - code_ << ","; + + { + std::stringstream call_args_ss; + for (const tir::Var& input_var : metadata_->inputs) { + if (input_var->type_annotation.defined()) { + codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(input_var.dtype(), call_args_ss); + } + call_args_ss << " " << input_var->name_hint << ","; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "void* output" << i << ","; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (pool_var->type_annotation.defined()) { + codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + } + call_args_ss << " " << pool_var->name_hint << ","; } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } + code_ << ");\n"; code_ << "int32_t " << entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << run_func << "("; - for (unsigned int i = 0; i < 
metadata_->inputs.size(); ++i) { - code_ << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,"; + + { + std::stringstream call_args_ss; + for (unsigned int i = 0; i < metadata_->inputs.size(); ++i) { + call_args_ss << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,"; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + int j = metadata_->inputs.size() + i; + call_args_ss << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data,"; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + call_args_ss << "&" << metadata_->pool_inputs.value()[pool_var]->pool_info->pool_name + << ","; + } + } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; + code_ << ");\n"; + code_ << "}\n"; + } + } + + std::unordered_map GenerateRunFuncToEntryPointArgMap() { + std::unordered_map run_func_to_entry_point_args; + int entrypoint_arg_count = 0; + int run_func_arg_count = 0; + + for (unsigned int i = 0; i < metadata_->inputs.size(); i++) { + run_func_to_entry_point_args[run_func_arg_count] = Integer(entrypoint_arg_count); + entrypoint_arg_count++; + run_func_arg_count++; } - for (int i = 0; i < metadata_->num_outputs; ++i) { - int j = metadata_->inputs.size() + i; - code_ << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data"; - if (i + 1 != metadata_->num_outputs) { - code_ << ","; + for (unsigned int i = 0; i < metadata_->num_outputs; i++) { + run_func_to_entry_point_args[run_func_arg_count] = Integer(entrypoint_arg_count); + entrypoint_arg_count++; + run_func_arg_count++; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + tir::usmp::AllocatedPoolInfo allocated_pool_info = metadata_->pool_inputs.value()[pool_var]; + run_func_to_entry_point_args[run_func_arg_count] = + allocated_pool_info->pool_info->pool_name; + run_func_arg_count++; } } - code_ << ");\n"; - code_ << "}\n"; + return run_func_to_entry_point_args; } void GenerateEntrypointForPackedAPI(const std::string& entrypoint_name, const std::string& run_func) { code_ << "TVM_DLL int32_t " << run_func; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " - "out_type_code, void* resource_handle);\n"; + "out_type_code, void* resource_handle);\n\n"; + code_ << "int32_t " << entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; + + // We are creating a copy of the set of pointers + size_t number_of_io_tensors = + metadata_->inputs.size() + metadata_->num_outputs + metadata_->pools.size(); + code_ << "TVMValue tensors[" << number_of_io_tensors << "];\n"; + + std::unordered_map run_func_to_entry_point_args = + GenerateRunFuncToEntryPointArgMap(); + for (unsigned int i = 0; i < number_of_io_tensors; i++) { + if (run_func_to_entry_point_args.find(i) != run_func_to_entry_point_args.end()) { + if (run_func_to_entry_point_args[i]->IsInstance()) { + String pool_name = Downcast(run_func_to_entry_point_args[i]); + String pool_name_tvmv = GenerateDLTensorStructWrapper(pool_name); + code_ << "tensors[" << i << "] = " << pool_name_tvmv << ";\n"; + } else { + code_ << "tensors[" << i << "] = ((TVMValue*)args)[" + << run_func_to_entry_point_args[Integer(i)] << "];\n"; + } + } + } + code_ << "return " << run_func; - code_ << "(args, type_code, num_args, out_value, out_type_code, resource_handle);\n"; + code_ << "((void*)tensors, type_code, 
num_args, out_value, out_type_code, resource_handle);\n"; code_ << "}\n"; } @@ -245,14 +363,35 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { const std::string& mod_name) { code_ << "#include <" << mod_name << ".h>\n"; code_ << "TVM_DLL int32_t " << run_func << "("; - unsigned int total_args = - (metadata_->inputs.size() + metadata_->devices.size() + metadata_->num_outputs); - for (unsigned int i = 0; i < total_args; ++i) { - code_ << "void* arg" << i; - if (i + 1 != total_args) { - code_ << ","; + { + std::stringstream call_args_ss; + for (const tir::Var& input_var : metadata_->inputs) { + if (input_var->type_annotation.defined()) { + codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(input_var.dtype(), call_args_ss); + } + call_args_ss << " " << codegen_c_.SanitiseName(input_var->name_hint) << ","; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "void* output" << i << ","; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (pool_var->type_annotation.defined()) { + codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + } + call_args_ss << " " << pool_var->name_hint << ","; + } + for (const String& device : metadata_->devices) { + call_args_ss << "void* " << device << ","; } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } + code_ << ");\n"; code_ << "int32_t " << entrypoint_name << "("; code_ << "struct " << runtime::get_name_mangled(mod_name, "inputs") << "* inputs,"; @@ -265,32 +404,32 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << ") {" << "return " << run_func << "("; - for (const auto& input : metadata_->inputs) { - std::string sanitised_input = input; - std::replace_if(sanitised_input.begin(), sanitised_input.end(), isNotAlnum, '_'); - code_ << "inputs->" << sanitised_input << ","; - } - if (metadata_->num_outputs == 1) { - code_ << "outputs->output"; - } else { - for (int i = 0; i < metadata_->num_outputs; ++i) { - code_ << "outputs->output" << i; - if (i + 1 != metadata_->num_outputs) { - code_ << ","; + + { + std::stringstream call_args_ss; + for (const auto& input : metadata_->inputs) { + call_args_ss << "inputs->" << codegen_c_.SanitiseName(input->name_hint) << ","; + } + if (metadata_->num_outputs == 1) { + call_args_ss << "outputs->output,"; + } else { + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "outputs->output" << i << ","; } } - } - - if (!metadata_->devices.empty()) { - code_ << ","; - for (const String& device : metadata_->devices) { - code_ << "devices->" << device; - if (device != metadata_->devices.back()) { - code_ << ","; + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + call_args_ss << "&" << metadata_->pool_inputs.value()[pool_var]->pool_info->pool_name + << ","; } } + for (const String& device : metadata_->devices) { + call_args_ss << "devices->" << device << ","; + } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } - code_ << ");\n"; code_ << "}\n"; } @@ -309,6 +448,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "extern \"C\" {\n"; code_ << "#endif\n"; + GenerateInternalWorkspaceBuffers(); + if (metadata_->unpacked_api) { if (metadata_->interface_api == "c") { GenerateCInterfaceEntrypoint(entrypoint_mangled, 
run_func_mangled, metadata_->mod_name); diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index 169983a525df..d118547517fe 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -57,33 +57,22 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { const Stmt nop = Evaluate(0); std::vector device_init; - // Create arg to buffer binder - std::unordered_map vmap; - ArgBinder binder(&vmap); // Collect variables and buffers to map between Array args; - std::vector> var_def; - bool buffer_map_found = false; - - for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { - Var param = func_ptr->params[i]; - - auto it = func_ptr->buffer_map.find(param); - if (it != func_ptr->buffer_map.end()) { - args.push_back((*it).second->data); - buffer_map_found = true; - } else { - args.push_back(param); - } + // We only iterate the function params upto number of buffer_map keys + // because if there exist a resource handle, it will not have a buffer + for (unsigned int i = 0; i < func->buffer_map.size(); i++) { + Var param = func->params[i]; + args.push_back(func->buffer_map[param]->data); } - - if (buffer_map_found) { - device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); - device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + if (func->params.size() == func->buffer_map.size() + 1) { + args.push_back(func->params.back()); } + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); - func_ptr->body = MergeNest({device_init, binder.init_nest(), binder.asserts()}, func_ptr->body); + func_ptr->body = MergeNest(device_init, func_ptr->body); func_ptr->params = args; func_ptr->ret_type = PrimType(DataType::Int(32)); diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index ea53f27e5558..fb4fb52c507e 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -58,7 +59,9 @@ class BufferInfoExtractor : public StmtExprVisitor { public: explicit BufferInfoExtractor(const IRModule& module) : module_(module) { for (const auto& gv_func : module_->functions) { - functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + if (gv_func.second->IsInstance()) { + functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + } } // Pushing a scope info for the initial body of the main function scope_stack_.push(ScopeInfo()); @@ -342,16 +345,24 @@ void BufferInfoExtractor::VisitExpr_(const VarNode* op) { Array static GetMatchedBuffers(const PrimFunc& func) { Array buffer_vars; - for (const auto& param : func->params) { + for (unsigned int i = 0; i < func->params.size() - 1; i++) { + Var param = func->params[i]; buffer_vars.push_back(func->buffer_map[param]->data); } + Var last_param = func->params.back(); + // Checks whether last var is present in the buffer map + // because it could be the resource handle + if (func->buffer_map.find(last_param) != func->buffer_map.end()) { + buffer_vars.push_back(func->buffer_map[last_param]->data); + } return buffer_vars; } void BufferInfoExtractor::UpdateAliases(const Array& args, const PrimFunc& func) { auto param_buffers = GetMatchedBuffers(func); - ICHECK(args.size() == param_buffers.size()); - for (size_t i = 0; i < args.size(); i++) { + // Last var could be a resource 
handle that does not have a Buffer + ICHECK(args.size() == param_buffers.size() || args.size() - 1 == param_buffers.size()); + for (size_t i = 0; i < param_buffers.size(); i++) { auto arg = args[i]; auto param_buf = param_buffers[i]; // If tir.allocates are passed in to functions diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc new file mode 100644 index 000000000000..ef6559b27caa --- /dev/null +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace tir { +namespace usmp { + +/*! \brief Assign PoolInfo objects to allocate that does not have any. + * The schedulers have the oppurtunity to assign PoolInfo objects to + * allocate nodes. However, each allocate node is expected to have + * at least one PoolInfo node assigned to it. If it was not the case, + * this Pass will assign all PoolInfo objects that the target could + * access.*/ +class PoolInfoAssigner : public StmtExprMutator { + public: + explicit PoolInfoAssigner(const IRModule& module) { + PrimFunc main_func = + Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + ICHECK(main_func.defined()) << "main function is not in the module"; + Optional target_host = main_func->GetAttr(tvm::attr::kTarget); + ICHECK(target_host) << "main function does not have a target attr"; + Array pool_infos = + module->GetAttr>(tvm::attr::kPoolInfoIRModuleAttr) + .value_or({usmp::PoolInfo("global_workspace", + {{target_host.value(), usmp::kTargetPoolReadWriteAccess}}, + usmp::kUnrestrictedPoolSizeHint, Bool(true))}); + for (const usmp::PoolInfo& pool_info : pool_infos) { + for (const auto& kv : pool_info->target_access) { + Target tgt = kv.first; + if (target_pool_infos_.find(tgt) == target_pool_infos_.end()) { + target_pool_infos_.Set(tgt, Array()); + } + Array pool_info_arr = target_pool_infos_[tgt]; + pool_info_arr.push_back(pool_info); + target_pool_infos_.Set(tgt, pool_info_arr); + } + } + mod_ = module->ShallowCopy(); + } + + IRModule operator()(); + + private: + Stmt VisitStmt_(const AllocateNode* op) override; + + IRModule mod_; + Map> target_pool_infos_; + PrimFunc func_; +}; + +Stmt PoolInfoAssigner::VisitStmt_(const AllocateNode* op) { + Optional tgt = func_->GetAttr(tvm::attr::kTarget).value(); + ICHECK(tgt) << "The following PrimFunc does not have a target attr: \n" << func_; + Map annotations = Map(op->annotations); + if (op->annotations.find(kPoolCandidatesAllocateAttr) == op->annotations.end()) { + annotations.Set(kPoolCandidatesAllocateAttr, target_pool_infos_[tgt.value()]); + } + Stmt body = VisitStmt(op->body); + auto allocate = + 
Allocate(op->buffer_var, op->dtype, op->extents, op->condition, body, annotations); + return allocate; +} + +IRModule PoolInfoAssigner::operator()() { + for (const auto& kv : mod_->functions) { + GlobalVar gv = kv.first; + if (kv.second->IsInstance()) { + func_ = Downcast(kv.second); + Stmt body = this->VisitStmt(func_->body); + PrimFunc new_prim_func = + PrimFunc(func_->params, body, func_->ret_type, func_->buffer_map, func_->attrs); + mod_->Update(gv, new_prim_func); + } + } + return mod_; +} + +namespace transform { + +tvm::transform::Pass AssignPoolInfo() { + auto pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { + return PoolInfoAssigner(m)(); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "tir.usmp.AssignPoolInfo", {}); +} + +TVM_REGISTER_GLOBAL("tir.usmp.transform.AssignPoolInfo").set_body_typed(AssignPoolInfo); + +} // namespace transform + +} // namespace usmp +} // namespace tir +} // namespace tvm diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index 5ebf3c557b06..cd797681d474 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -120,12 +121,15 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { /*! \brief This is a helper to append the pool args to * the callsite of the function. */ - Array AppendPoolParamsToArgs(const Array& args); + Array AppendPoolParamsToArgs(Array args, const PrimFunc& func); /*! \brief Some arguments that used to be Allocate nodes * should be replaced by Let nodes in the pass that loads * the space from a pool variable. */ Array ReplaceAllocateArgsWithLetArgs(const Array& args); + /*! \brief Obtain a resource handle if its there + */ + Optional GetResourceHandle(const PrimFunc& func); /*! 
\brief The tir::Var map to PoolInfo objects */ Map primfunc_args_to_pool_info_map_; @@ -151,10 +155,23 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { std::unordered_set visited_primfuncs; }; +Optional PoolAllocationToOffsetConverter::GetResourceHandle(const PrimFunc& func) { + if (func->buffer_map.find(func->params.back()) == func->buffer_map.end()) { + return func->params.back(); + } + return Optional(); +} + PoolAllocationToOffsetConverter::ScopeInfo PoolAllocationToOffsetConverter::UpdateFunctionScopeInfo( const PrimFunc& original_func) { ScopeInfo si; + + Optional resource_handle = GetResourceHandle(original_func); si.params = original_func->params; + if (resource_handle) { + si.params.pop_back(); + ICHECK(si.params.size() == original_func->params.size() - 1); + } si.buffer_map = original_func->buffer_map; Map ret; for (const AllocatedPoolInfo& allocated_pool_info : allocated_pool_ordering_) { @@ -179,6 +196,9 @@ PoolAllocationToOffsetConverter::ScopeInfo PoolAllocationToOffsetConverter::Upda si.buffer_map.Set(pool_var, Buffer(buffer_var, elem_dtype, {pool_size}, {1}, 1, buffer_var_name, 16, 1, BufferType::kDefault)); } + if (resource_handle) { + si.params.push_back(resource_handle.value()); + } return si; } @@ -199,7 +219,7 @@ PrimFunc PoolAllocationToOffsetConverter::CreatePrimFuncWithPoolParams( PrimFunc ret = PrimFunc(si.params, new_body, original_primfunc->ret_type, si.buffer_map, original_attrs); if (!emit_tvmscript_printable_) { - return WithAttr(ret, tvm::attr::kPoolArgs, si.allocated_pool_params); + ret = WithAttr(ret, tvm::attr::kPoolArgs, si.allocated_pool_params); } visited_primfuncs.insert(ret); return ret; @@ -207,9 +227,14 @@ PrimFunc PoolAllocationToOffsetConverter::CreatePrimFuncWithPoolParams( return original_primfunc; } -Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs( - const Array& args) { +Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs(Array args, + const PrimFunc& func) { Array new_args; + PrimExpr resource_handle_arg; + if (args.size() == func->params.size() + 1) { + resource_handle_arg = args.back(); + args.pop_back(); + } for (const auto& arg : args) { new_args.push_back(VisitExpr(arg)); } @@ -219,6 +244,9 @@ Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs( Buffer buffer_var = top_scope.buffer_map[pool_var]; new_args.push_back(buffer_var->data); } + if (resource_handle_arg.defined()) { + new_args.push_back(resource_handle_arg); + } return new_args; } @@ -240,12 +268,13 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::call_extern()) || op->op.same_as(builtin::tvm_call_cpacked())) { String func_name = Downcast(op->args[0])->value; Array new_args; - if (module_->ContainGlobalVar(func_name)) { + if (module_->ContainGlobalVar(func_name) && + module_->Lookup(func_name)->IsInstance()) { GlobalVar gv = module_->GetGlobalVar(func_name); PrimFunc func = Downcast(module_->Lookup(gv)); PrimFunc prim_func = CreatePrimFuncWithPoolParams(func); module_->Update(gv, prim_func); - new_args = AppendPoolParamsToArgs(op->args); + new_args = AppendPoolParamsToArgs(op->args, prim_func); new_args = ReplaceAllocateArgsWithLetArgs(new_args); } else { new_args = ReplaceAllocateArgsWithLetArgs(op->args); @@ -255,8 +284,7 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const CallNode* op) { if (op->op->IsInstance()) { PrimFunc func = Downcast(op->op); PrimFunc prim_func = CreatePrimFuncWithPoolParams(func); - Array new_args = AppendPoolParamsToArgs(op->args); - 
new_args = AppendPoolParamsToArgs(new_args); + Array new_args = AppendPoolParamsToArgs(op->args, prim_func); new_args = ReplaceAllocateArgsWithLetArgs(new_args); return Call(op->dtype, prim_func, new_args); } @@ -329,8 +357,7 @@ IRModule PoolAllocationToOffsetConverter::operator()() { namespace transform { tvm::transform::Pass ConvertPoolAllocationsToOffsets( - const Map& pool_allocations, - Bool emit_tvmscript_printable = Bool(false)) { + const Map& pool_allocations, Bool emit_tvmscript_printable) { auto pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { return Downcast(PoolAllocationToOffsetConverter( m, pool_allocations, emit_tvmscript_printable->value != 0)()); diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc new file mode 100644 index 000000000000..9ef88f9acca9 --- /dev/null +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/analysis/usmp/unified_static_memory_planner.cc + * \brief This is the pass that integrates the USMP passes to + * a single composite pass. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace tvm { + +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.disable", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); + +namespace tir { +namespace usmp { + +static constexpr const char* kDefaultAlgo = "greedy_by_size"; + +static std::unordered_map( + const Array&, const Integer&)>> + algorithms{{"greedy_by_size", algo::GreedyBySize}, + {"greedy_by_conflicts", algo::GreedyByConflicts}}; + +IRModule PlanMemory(const IRModule& mod, String algo) { + VLOG(1) << "workspace required = " << CalculateModuleWorkspaceSize(mod); + PrimFunc main_func = Downcast(mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod); + Array buffer_info_arr = + CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts); + Map buffer_info_pool_allocations = + algorithms[algo](buffer_info_arr, buffer_info_analysis->memory_pressure); + Map stmt_pool_allocations = AssignStmtPoolAllocations( + buffer_info_analysis->buffer_info_stmts, buffer_info_pool_allocations); + IRModule ret = transform::ConvertPoolAllocationsToOffsets(stmt_pool_allocations)(mod); + tir::PrimFunc tir_main_func = + Downcast(ret->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Optional> allocated_pool_infos = + tir_main_func->GetAttr>(tvm::attr::kPoolArgs); + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + VLOG(1) << "pool_size = " << allocated_pool_info->allocated_size; + } + } + return ret; +} + +} // namespace usmp + +namespace transform { + +tvm::transform::Pass UnifiedStaticMemoryPlanner() { + auto usmp_main_pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { + auto algorithm_str = ctx->GetConfig("tir.usmp.algorithm", String(usmp::kDefaultAlgo)); + return Downcast( + usmp::PlanMemory(m, algorithm_str.value_or(String(usmp::kDefaultAlgo)))); + }; + + return tvm::transform::Sequential( + {tvm::tir::usmp::transform::AssignPoolInfo(), + tvm::transform::CreateModulePass(usmp_main_pass_func, 0, + "tir.transform.UnifiedStaticMemoryPlanner", {})}); +} + +TVM_REGISTER_GLOBAL("tir.transform.UnifiedStaticMemoryPlanner") + .set_body_typed(UnifiedStaticMemoryPlanner); + +} // namespace transform +} // namespace tir +} // namespace tvm diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 14b3d26641a3..36aa3b6d768a 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -24,7 +24,11 @@ #include #include +#include +#include +#include #include +#include #include namespace tvm { @@ -88,11 +92,13 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << ",\n memory_pressure=" << node->memory_pressure << ")"; }); -PoolInfo::PoolInfo(String pool_name, Map target_access, Integer size_hint_bytes) { +PoolInfo::PoolInfo(String pool_name, Map target_access, Integer size_hint_bytes, + Bool is_internal) { auto poolinfo_node = make_object(); poolinfo_node->pool_name = pool_name; poolinfo_node->size_hint_bytes = size_hint_bytes; poolinfo_node->target_access = target_access; + poolinfo_node->is_internal = is_internal; data_ = std::move(poolinfo_node); } @@ -168,6 +174,20 @@ Array CreateArrayBufferInfo(const Map& buffer_info return ret; } +void PrintConflicts(const Array& bi_arr) { + for (const auto& bi : bi_arr) { + std::stringstream ss; + ss << "buf=" << bi->name_hint; + ss << "|conflicts="; + for (const auto& conflict_bi : bi->conflicts) { + BufferInfo cbi = 
Downcast(conflict_bi); + ss << cbi->name_hint << ","; + } + ss << "\n"; + LOG(INFO) << ss.str(); + } +} + Map AssignStmtPoolAllocations( const Map& buffer_info_to_stmt, const Map& buffer_info_to_pool_allocation) { @@ -195,6 +215,66 @@ Integer CalculateExtentsSize(const AllocateNode* op) { return Integer(num_elements * element_size_bytes); } +class ModuleWorkspaceSizeCalculator : public StmtExprVisitor { + public: + explicit ModuleWorkspaceSizeCalculator(const IRModule& module) : mod_(module) { + for (const auto& gv_func : mod_->functions) { + functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + } + main_func_ = Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + ICHECK(main_func_.defined()) << "main function is not in the module"; + Optional target_host = main_func_->GetAttr(tvm::attr::kTarget); + ICHECK(target_host) << "main function does not have a target attr"; + target_host_ = target_host.value(); + } + + Integer operator()() { + UpdateWorkspaceData(main_func_); + return Integer(max_workspace_size); + } + + private: + void UpdateWorkspaceData(const PrimFunc& func) { + Target tgt = func->GetAttr(tvm::attr::kTarget).value_or(target_host_); + Integer workspace_byte_alignment = + tgt->GetAttr("workspace-byte-alignment").value_or(16); + Integer workspace_req = CalculateWorkspaceBytes(func, workspace_byte_alignment); + if (workspace_req) { + current_workspace_size_ += workspace_req->value; + } + if (max_workspace_size < current_workspace_size_) { + max_workspace_size = current_workspace_size_; + } + this->VisitStmt(func->body); + if (workspace_req) { + current_workspace_size_ -= workspace_req->value; + } + } + + void VisitExpr_(const CallNode* op) override { + if (op->op.same_as(builtin::call_extern())) { + PrimFunc func = functions_.at(Downcast(op->args[0])->value); + UpdateWorkspaceData(func); + } else if (op->op->IsInstance()) { + PrimFunc func = Downcast(op->op); + UpdateWorkspaceData(func); + } else { + StmtExprVisitor::VisitExpr_(op); + } + } + + IRModule mod_; + Target target_host_; + PrimFunc main_func_; + Map functions_; + size_t current_workspace_size_ = 0; + size_t max_workspace_size = 0; +}; + +Integer CalculateModuleWorkspaceSize(const IRModule& mod) { + return ModuleWorkspaceSizeCalculator(mod)(); +} + TVM_REGISTER_GLOBAL("tir.usmp.CreateArrayBufferInfo") .set_body_typed([](Map buffer_info_map) { return (CreateArrayBufferInfo(buffer_info_map)); diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 96c8433a6384..36786ed8f2fe 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -214,7 +214,9 @@ def create_test_runner(accel="ethos-u55-256"): pass_config={ "relay.ext.ethos-u.options": { "accelerator_config": accel, - } + }, + # TODO(@manupa-arm): enable USMP once TIR to CS translator is capable of handling let bindings + "tir.usmp.disable": True, }, ) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 3f448ca4a7d9..c65888953c33 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -702,9 +702,8 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - workspace_bytes += model.extra_memory_in_bytes - if interface_api == "packed": - workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) + # TODO(@manupa-arm): remove the stack allocator once the microNPU uses target hooks + workspace_bytes = model.extra_memory_in_bytes 
for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) @@ -815,6 +814,7 @@ def compile_and_run( target=target, target_opts=target_opts, ) + run_and_check( models=compiled_test_mods, runner=runner, diff --git a/tests/python/relay/aot/corstone300.ld b/tests/python/relay/aot/corstone300.ld index ddf55b868780..e066b1538481 100644 --- a/tests/python/relay/aot/corstone300.ld +++ b/tests/python/relay/aot/corstone300.ld @@ -249,6 +249,13 @@ SECTIONS . = ALIGN(16); } > SRAM AT > SRAM + .bss.tvm : + { + . = ALIGN(16); + *(.bss.tvm) + . = ALIGN(16); + } > DDR + .bss.NoInit : { . = ALIGN(16); diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 8a2b1f1bb84d..6d2806b8c8e9 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -515,8 +515,8 @@ def test_quant_mobilenet_tfl(): import tvm.relay.testing.tf as tf_testing - interface_api = "packed" - use_unpacked_api = False + use_unpacked_api = True + interface_api = "c" test_runner = AOT_DEFAULT_RUNNER tflite_model_file = tf_testing.get_workload_official( @@ -658,14 +658,14 @@ def test_deprecated_target_arguments(capsys): @pytest.mark.parametrize( - "workspace_byte_alignment,main_workspace_size,sum_workspace_size", + "workspace_byte_alignment,main_workspace_size", [ - (8, 10368, 15200), - (16, 10368, 15232), - (256, 10752, 17408), + (8, 17280), + (16, 17280), + (256, 17792), ], ) -def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_workspace_size): +def test_memory_planning(workspace_byte_alignment, main_workspace_size): mod, params = tvm.relay.testing.synthetic.get_workload() target = "c" runtime = Runtime("crt") @@ -675,22 +675,18 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work "workspace-byte-alignment": workspace_byte_alignment, }, ) - with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "tir.disable_storage_rewrite": True, + "tir.usmp.algorithm": "greedy_by_conflicts", + }, + ): lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) - assert ( sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size ) - assert ( - sum( - [ - size - for metadata in lib.function_metadata.values() - for size in metadata.workspace_sizes.values() - ] - ) - == sum_workspace_size - ) def test_aot_codegen_backend_alloc_workspace_calls(): @@ -726,6 +722,7 @@ def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), fl models=AOTTestModel(module=relay_mod, inputs=None, outputs=None), interface_api="c", use_unpacked_api=True, + pass_config={"tir.usmp.disable": True}, ) source = compiled_test_mods[0].executor_factory.lib.imported_modules[0].get_source() # There should be three allocates created for three primitive relay function From 2e945c28e2d5ea6257e525b90fef62f6dc284dba Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 13 Dec 2021 13:18:14 +0000 Subject: [PATCH 02/15] USMP integration with AoT executor codegen * This commit breaks out the USMP related tests to test_crt_aot_usmp.py. * Switched the polarity of the pass config option to explicitly enable USMP rather than disable. 
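For reference, a minimal sketch of how USMP is switched on from Python under the new polarity (this mirrors the usage in the new test_crt_aot_usmp.py introduced below; the workload, target and algorithm choice here are only illustrative):

    import tvm
    from tvm import relay
    from tvm.relay import testing
    from tvm.relay.backend import Executor, Runtime

    mod, params = testing.synthetic.get_workload()
    executor = Executor("aot", {"workspace-byte-alignment": 16})
    runtime = Runtime("crt")

    # USMP is now opt-in: enable it, and optionally select an algorithm
    # ("greedy_by_size" or "greedy_by_conflicts"), via the PassContext config.
    with tvm.transform.PassContext(
        opt_level=3,
        config={
            "tir.disable_vectorize": True,
            "tir.disable_storage_rewrite": True,
            "tir.usmp.enable": True,
            "tir.usmp.algorithm": "greedy_by_size",
        },
    ):
        lib = relay.build(mod, "c", executor=executor, runtime=runtime, params=params)
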
Change-Id: Id4bf35b18479b70924ec24e6bb7ba2682b05326e --- src/relay/backend/aot_executor_codegen.cc | 4 +- src/tir/usmp/unified_static_memory_planner.cc | 2 +- tests/python/contrib/test_ethosu/infra.py | 2 - tests/python/relay/aot/aot_test_utils.py | 4 +- tests/python/relay/aot/test_crt_aot.py | 33 --- tests/python/relay/aot/test_crt_aot_usmp.py | 266 ++++++++++++++++++ 6 files changed, 272 insertions(+), 39 deletions(-) create mode 100644 tests/python/relay/aot/test_crt_aot_usmp.py diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 38b1cee6b9a2..a6a433a7cb76 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -691,13 +691,13 @@ class AOTExecutorCodegen : public MixedModeVisitor { */ IRModule PlanMemoryLoweredModule(const IRModule& mod) { transform::PassContext pass_ctx = transform::PassContext::Current(); - bool disable_usmp = pass_ctx->GetConfig("tir.usmp.disable", Bool(false)).value(); + bool enable_usmp = pass_ctx->GetConfig("tir.usmp.enable", Bool(false)).value(); IRModule lowered_mod = mod->ShallowCopy(); Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); - if (!disable_usmp) { + if (enable_usmp) { lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); // Update workspace size based on the pool allocations. Optional> allocated_pool_infos = diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 9ef88f9acca9..00abaac1b0dd 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -35,7 +35,7 @@ namespace tvm { -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.disable", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.enable", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); namespace tir { diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 36786ed8f2fe..0b058a94fb60 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -215,8 +215,6 @@ def create_test_runner(accel="ethos-u55-256"): "relay.ext.ethos-u.options": { "accelerator_config": accel, }, - # TODO(@manupa-arm): enable USMP once TIR to CS translator is capable of handling let bindings - "tir.usmp.disable": True, }, ) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index c65888953c33..6900bdc2e6e1 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -702,8 +702,10 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - # TODO(@manupa-arm): remove the stack allocator once the microNPU uses target hooks workspace_bytes = model.extra_memory_in_bytes + use_usmp = runner.pass_config.get("tir.usmp.enable", False) + if interface_api == "packed" and not use_usmp: + workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 6d2806b8c8e9..566566da1dce 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -657,38 +657,6 @@ def test_deprecated_target_arguments(capsys): ) -@pytest.mark.parametrize( - 
"workspace_byte_alignment,main_workspace_size", - [ - (8, 17280), - (16, 17280), - (256, 17792), - ], -) -def test_memory_planning(workspace_byte_alignment, main_workspace_size): - mod, params = tvm.relay.testing.synthetic.get_workload() - target = "c" - runtime = Runtime("crt") - executor = Executor( - "aot", - { - "workspace-byte-alignment": workspace_byte_alignment, - }, - ) - with tvm.transform.PassContext( - opt_level=3, - config={ - "tir.disable_vectorize": True, - "tir.disable_storage_rewrite": True, - "tir.usmp.algorithm": "greedy_by_conflicts", - }, - ): - lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) - assert ( - sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size - ) - - def test_aot_codegen_backend_alloc_workspace_calls(): """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls""" @@ -722,7 +690,6 @@ def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), fl models=AOTTestModel(module=relay_mod, inputs=None, outputs=None), interface_api="c", use_unpacked_api=True, - pass_config={"tir.usmp.disable": True}, ) source = compiled_test_mods[0].executor_factory.lib.imported_modules[0].get_source() # There should be three allocates created for three primitive relay function diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py new file mode 100644 index 000000000000..b88e0905dba5 --- /dev/null +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -0,0 +1,266 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" This file contains test that use USMP + AoT using C runtime APIs""" + +from collections import OrderedDict +import sys + +import numpy as np +import pytest + +import tvm +from tvm import relay, TVMError +from tvm.ir.module import IRModule +from tvm.relay import testing, transform +from tvm.relay.testing import byoc +from tvm.relay.op.annotation import compiler_begin, compiler_end +from tvm.relay.backend import Executor, Runtime +from aot_test_utils import ( + AOTTestModel, + AOTTestRunner, + generate_ref_data, + convert_to_relay, + compile_and_run, + compile_models, + parametrize_aot_options, + run_and_check, +) + + +def check_for_no_tvm_backendallocworkspace_calls(mod: tvm.runtime.module): + """This checker checks whether any c-source produced has TVMBackendAllocWorkspace calls. 
+ If USMP is invoked, none of them should have TVMBAW calls""" + dso_modules = mod._collect_dso_modules() + for dso_mod in dso_modules: + assert ( + dso_mod.type_key == "c" + ), 'Current CRT AoT codegen flow should only produce type "c" runtime modules' + source = dso_mod.get_source() + source.count( + "TVMBackendAllocWorkspace" + ) == 0, "This is failing because USMP was unable to plan for every tir.allocate node" + + +@pytest.mark.parametrize( + "workspace_byte_alignment,main_workspace_size", + [ + (8, 17280), + (16, 17280), + (256, 17792), + ], +) +def test_memory_planning(workspace_byte_alignment, main_workspace_size): + mod, params = tvm.relay.testing.synthetic.get_workload() + target = "c" + runtime = Runtime("crt") + executor = Executor( + "aot", + { + "workspace-byte-alignment": workspace_byte_alignment, + }, + ) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "tir.disable_storage_rewrite": True, + "tir.usmp.enable": True, + "tir.usmp.algorithm": "greedy_by_conflicts", + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + assert ( + sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size + ) + + +@parametrize_aot_options +@pytest.mark.parametrize("groups,weight_shape", [(1, 32), (32, 1)]) +def test_conv2d(interface_api, use_unpacked_api, test_runner, groups, weight_shape): + """Test a subgraph with a single conv2d operator.""" + dtype = "float32" + ishape = (1, 32, 14, 14) + wshape = (32, weight_shape, 3, 3) + pass_config = {"tir.usmp.enable": True} + test_runner = AOTTestRunner( + makefile=test_runner.makefile, + prologue=test_runner.prologue, + epilogue=test_runner.epilogue, + includes=test_runner.includes, + parameters=test_runner.parameters, + pass_config=pass_config, + ) + + data0 = relay.var("data", shape=ishape, dtype=dtype) + weight0 = relay.var("weight", shape=wshape, dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=groups) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, wshape).astype(dtype) + + inputs = OrderedDict([("data", i_data), ("weight", w1_data)]) + + output_list = generate_ref_data(mod, inputs) + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + test_runner, + interface_api, + use_unpacked_api, + ) + compiled_test_mods = compile_models( + models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) + + +@pytest.mark.parametrize("merge_compiler_regions", [False, True]) +def test_byoc_microtvm(merge_compiler_regions): + """This is a simple test to check BYOC capabilities of AOT - with and without merging compiler regions to test for https://github.com/apache/tvm/issues/9036""" + use_unpacked_api = False + interface_api = "packed" + test_runner = AOTTestRunner(pass_config={"tir.usmp.enable": True}) + + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + + # z0 = x + w0 + x_ = 
compiler_begin(x, "ccompiler") + w0_ = compiler_begin(w0, "ccompiler") + z0_ = relay.add(x_, w0_) + z0 = compiler_end(z0_, "ccompiler") + + # z1 = z0 + w1 + z0__ = compiler_begin(z0, "ccompiler") + w1_ = compiler_begin(w1, "ccompiler") + z1_ = relay.add(z0__, w1_) + z1 = compiler_end(z1_, "ccompiler") + + # z2 = z0 + z1 + z2 = relay.add(z0, z1) + + f = relay.Function([x, w0, w1], z2) + mod = tvm.IRModule() + mod["main"] = f + + if merge_compiler_regions: + mod = transform.MergeCompilerRegions()(mod) + + mod = transform.PartitionGraph("mod_name")(mod) + mod = transform.InferType()(mod) + + x_data = [("x", np.random.rand(10, 10).astype("float32"))] + w_data = [("w{}".format(i), np.random.rand(10, 10).astype("float32")) for i in range(2)] + + map_inputs = OrderedDict(x_data + w_data) + output_list = generate_ref_data(mod, map_inputs) + + compiled_test_mods = compile_models( + AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) + + +MOBILENET_V1_URL = ( + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", +) + + +@pytest.mark.parametrize( + "model_url, usmp_algo, workspace_size,", + [ + (MOBILENET_V1_URL, "greedy_by_size", 4845696), + (MOBILENET_V1_URL, "greedy_by_conflicts", 4845696), + ], +) +def test_tflite_model(model_url, usmp_algo, workspace_size): + """This checks for ML models and the memory used by them when using USMP with different algorithms""" + pytest.importorskip("tflite") + + import tvm.relay.testing.tf as tf_testing + + use_unpacked_api = True + interface_api = "c" + test_runner = AOTTestRunner( + pass_config={"tir.usmp.enable": True, "tir.usmp.algorithm": usmp_algo} + ) + + tflite_model_file = tf_testing.get_workload_official( + model_url[0], + model_url[1], + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data_shape = (1, 224, 224, 3) + in_min, in_max = (0, 255) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") + mod, params = convert_to_relay(tflite_model_buf, data, "input") + inputs = {"input": data} + output_list = generate_ref_data(mod, inputs, params) + + compiled_test_mods = compile_models( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + # Checking the workspace size + assert ( + sum( + compiled_model.executor_factory.function_metadata[ + "__tvm_main__" + ].workspace_sizes.values() + ) + == workspace_size + ) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) From 13afe19d30aabc881e638dc293707412b689b041 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 13 Dec 2021 17:00:23 +0000 Subject: [PATCH 03/15] USMP integration with AoT executor codegen * Swap pool var and output var ordering in the main function Change-Id: Id03748e6f3528399a0ddd9cc2d011adfcee8d554 --- 
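In other words, the generated main function now takes its arguments in the order: input handles, then pool handles, then output handles; previously the pool handles came after the outputs. A sketch of the signature change, taken from the updated expectations in the test below (buffer names and counts are specific to that test case):

    # before (inputs, outputs, pool vars):
    def run_model(input: T.handle, output: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle) -> None: ...

    # after (inputs, pool vars, outputs):
    def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: ...
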
..._tir_usmp_transform_convert_pool_allocations_to_offsets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index fc615775c160..ab40c646391c 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -140,7 +140,7 @@ def run_model(input: T.handle, output: T.handle) -> None: @tvm.script.ir_module class LinearStructurePlanned: @T.prim_func - def run_model(input: T.handle, output: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle) -> None: + def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], dtype="uint8", strides=[1], elem_offset=1, align=16) slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=1, align=16) # body @@ -464,7 +464,7 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T.store(T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2, T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_1_let, ax3_inner_2) + T.load("int32", placeholder_15.data, ax3_inner_2), 1608879842, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True) @T.prim_func - def run_model(input: T.handle, output: T.handle, global_workspace_0_var: T.handle) -> None: + def run_model(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None: global_workspace_0_buffer_var = T.match_buffer(global_workspace_0_var, [7920256], dtype="uint8", strides=[1], elem_offset=1, align=16) # body T.attr("default", "device_id", 0) From 6aa927c3a048ea4e47ee4c895fe863adc49e1ca5 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 20 Dec 2021 12:22:00 +0000 Subject: [PATCH 04/15] USMP integration with AoT executor codegen * Improving comments * Changing the usage of tir Bool to just plain bool Change-Id: I907cf9a0befa172183ae488bb3b94660aad39807 --- include/tvm/tir/transform.h | 3 ++- include/tvm/tir/usmp/transform.h | 12 +++++------- include/tvm/tir/usmp/utils.h | 10 ++++++---- src/tir/usmp/unified_static_memory_planner.cc | 2 ++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 97d750ce6aad..3a964eb77d1b 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -486,7 +486,8 @@ TVM_DLL Pass ConvertForLoopsToSerial(); /*! * \brief This is the unified static memory planner pass that will - * plan for memory intra- and inter- PrimFuncs together. + * plan for memory intra- and inter- PrimFuncs together. The pass + * requires all the function to be PrimFuncs including the main. * \return The pass. */ TVM_DLL Pass UnifiedStaticMemoryPlanner(); diff --git a/include/tvm/tir/usmp/transform.h b/include/tvm/tir/usmp/transform.h index 7bc08ea3933c..6de64704bd8b 100644 --- a/include/tvm/tir/usmp/transform.h +++ b/include/tvm/tir/usmp/transform.h @@ -37,9 +37,9 @@ using Pass = tvm::transform::Pass; /*! 
* \brief Convert the analyzed PoolAllocation to offsets from pool variables * - * This pass would convert the IRModule that contains all PrimFuncs that contains - * the associated PoolAllocations to be read from being offset from the input var - * of the PrimFunc. + * This pass would convert the main function to accept pool variables as an input + * that get passed onto the operator PrimFuncs. Furthermore, the static allocations + * will be converted to offsets within the pool variable. * * \return the pass */ @@ -49,10 +49,8 @@ TVM_DLL Pass ConvertPoolAllocationsToOffsets(const Map target_access; // 'rw' or 'ro' - /*! \brief Whether pool is internally generated*/ - Bool is_internal = Bool(false); + /*! \brief Whether pool is internally generated. + * The internal pools will be generated as part of + * the entry point code generation of the executor*/ + bool is_internal = false; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pool_name", &pool_name); @@ -320,8 +322,8 @@ namespace attr { static constexpr const char* kPoolArgs = "pool_args"; /*! - * \brief This is a BaseFunc attribute to indicate which input var represent - * a PoolInfo Object in the form of a Map. + * \brief This is a IRModule attribute that contains all the PoolInfo objects + * as an Array. */ static constexpr const char* kPoolInfoIRModuleAttr = "pool_infos"; diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 00abaac1b0dd..91e0218e34c8 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -54,6 +54,8 @@ IRModule PlanMemory(const IRModule& mod, String algo) { BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod); Array buffer_info_arr = CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts); + CHECK(algorithms.count(algo)) << "The selected USMP algorithm : " << algo + << "is not defined. 
Please define it in the above algorithms map."; Map buffer_info_pool_allocations = algorithms[algo](buffer_info_arr, buffer_info_analysis->memory_pressure); Map stmt_pool_allocations = AssignStmtPoolAllocations( From 6e1d305c8be4d67611e1ee4353d3fe815785d8e8 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 20 Dec 2021 18:29:40 +0000 Subject: [PATCH 05/15] USMP integration with AoT executor codegen * fixing unit tests to scope With * removing global sanitization of var names Change-Id: Ifa8934f94744eeaac13e4b4ddcd671842c3dcb21 --- src/relay/backend/aot_executor_codegen.cc | 3 ++- src/target/source/codegen_source_base.cc | 6 +++--- src/target/source/codegen_source_base.h | 2 +- tests/cpp/build_module_test.cc | 16 ++++++++++------ 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a6a433a7cb76..3b9ffa75c275 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -42,6 +42,7 @@ #include #include +#include "../../target/source/codegen_source_base.h" #include "../op/annotation/annotation.h" #include "../op/call/call.h" #include "../op/memory/device_copy.h" @@ -849,7 +850,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - std::string input_name = input->name_hint(); + std::string input_name = codegen::CodeGenSourceBase::SanitiseName(input->name_hint()); CreateIOVar(input, input_name); } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 518da725d08f..e0becee2dd64 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -74,13 +74,13 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { std::string key = v->name_hint; std::string vid = GetUniqueName(key); var_idmap_[v] = vid; - return SanitiseName(vid); + return vid; } std::string CodeGenSourceBase::GetVarID(const tir::VarNode* v) const { auto it = var_idmap_.find(v); ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; - return SanitiseName(it->second); + return it->second; } void CodeGenSourceBase::PrintIndent() { @@ -113,7 +113,7 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -std::string CodeGenSourceBase::SanitiseName(std::string name) const { +std::string CodeGenSourceBase::SanitiseName(std::string name) { std::replace_if( name.begin(), name.end(), [](char c) { diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 0a3066db51a6..ff94fa68dae7 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -57,7 +57,7 @@ class CodeGenSourceBase { * \brief Sanitize names by removing illegal characters * \param name The name to be sanitised. */ - std::string SanitiseName(std::string name) const; + static std::string SanitiseName(std::string name); protected: /*! 
\brief entry in ssa assign map */ diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index d5a4c91a3c43..ff3641cd6982 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -107,18 +107,22 @@ TEST(BuildModule, Heterogeneous) { auto elemwise_sub = compute( C->shape, [©, &C](PrimExpr i) { return copy[i] - C[i]; }, "elemwise_sub"); - With cuda_scope(target_cuda); - auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add}); + auto fcreate_s1 = [=]() { + With cuda_scope(target_cuda); + return topi::cuda::schedule_injective(target_cuda, {elemwise_add}); + }; - With llvm_scope(target_llvm); - auto s2 = create_schedule({elemwise_sub->op}); + auto fcreate_s2 = [=]() { + With llvm_scope(target_llvm); + return create_schedule({elemwise_sub->op}); + }; auto args1 = Array({A, B, elemwise_add}); auto args2 = Array({copy, C, elemwise_sub}); std::unordered_map binds; - auto lowered_s1 = LowerSchedule(s1, args1, "elemwise_add", binds); - auto lowered_s2 = LowerSchedule(s2, args2, "elemwise_sub", binds); + auto lowered_s1 = LowerSchedule(fcreate_s1(), args1, "elemwise_add", binds); + auto lowered_s2 = LowerSchedule(fcreate_s2(), args2, "elemwise_sub", binds); Map inputs = {{target_cuda, lowered_s1}, {target_llvm, lowered_s2}}; auto module = build(inputs, Target()); From 82ba5fb555c324f4743dc6fd0d0a9753c843041e Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 10:18:43 +0000 Subject: [PATCH 06/15] USMP integration with AoT executor codegen * changing unpacked_api to accept PrimFunc without buffer_maps. Change-Id: I97e8df5272df7f1f8313f184aa660924989940e3 --- src/tir/transforms/make_unpacked_api.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index d118547517fe..bc9870b9ac84 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -60,15 +60,17 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { // Collect variables and buffers to map between Array args; - // We only iterate the function params upto number of buffer_map keys - // because if there exist a resource handle, it will not have a buffer - for (unsigned int i = 0; i < func->buffer_map.size(); i++) { - Var param = func->params[i]; - args.push_back(func->buffer_map[param]->data); - } - if (func->params.size() == func->buffer_map.size() + 1) { - args.push_back(func->params.back()); + for (const Var& param : func->params) { + // Ideally all func params should have Buffers defined in the buffer_map + // We should look to insert buffer_maps for all PrimFuncs that are returned + // to the core compiler. + if (func->buffer_map.find(param) != func->buffer_map.end()) { + args.push_back(func->buffer_map[param]->data); + } else { + args.push_back(param); + } } + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); From ee9f83939b848d5539e88a276c151a828832e437 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 18:13:37 +0000 Subject: [PATCH 07/15] USMP integration with AoT executor codegen * Restoring the conditional behaviour as we are getting PrimFuncs without buffer maps. 
Change-Id: Icd374353e0947515c523d506c1e7c02cf5d930b4 --- src/tir/transforms/make_unpacked_api.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index bc9870b9ac84..6365e09246fc 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -71,8 +71,10 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { } } - device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); - device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + if (func->buffer_map.size()) { + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + } func_ptr->body = MergeNest(device_init, func_ptr->body); func_ptr->params = args; From 3d7aaa40a76d62516f0ba561858b0dc281238144 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 18:34:27 +0000 Subject: [PATCH 08/15] USMP integration with AoT executor codegen * Adds tir includes to source_module.cc Change-Id: I8ad5654ea32abb3aabc0fcc9ef8157a609e9d2dc --- src/target/source/source_module.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 9ecbf173cbbc..50cba6b0b996 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include From e23fe524d49f9b69a637496c2be7ef200b0a82c1 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 22 Dec 2021 08:32:37 +0000 Subject: [PATCH 09/15] USMP integration with AoT executor codegen * fixing c device api tests Change-Id: I47ac38b5664a00f36b293a0d9d96330d4206173d --- tests/python/relay/aot/test_c_device_api.py | 64 +++++++++------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index 473b8d5ee300..c4ca733422e6 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -92,7 +92,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True): workspace_byte_alignment=16, pass_config=test_runner.pass_config, ) - main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[1][1] + main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[0][1] main_func = main_ir_module["run_model"] return main_func @@ -136,44 +136,29 @@ def test_device_api_hooks_unpacked_api(device_api_main_func): # Activate Device assert ( - str(main_func.body[0][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUActivate",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[0]) + == "tir.call_extern(" + '"TVMDeviceEthosUActivate",' + " device_context_ethos_u)\n" ) # Open Device + print(str(main_func.body[1])) assert ( - str(main_func.body[1].body.body[0][0][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUOpen",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][0]) + == "tir.call_extern(" + '"TVMDeviceEthosUOpen",' + " device_context_ethos_u)\n" ) # Device Call assert ( - str(main_func.body[1].body.body[0][0][1].value) - == "@tir.call_extern(" - + '"tvmgen_default_ethos_u_main_0",' - + " input: handle, output: handle," - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][1]) 
+ == 'tir.call_extern("tvmgen_default_ethos_u_main_0", x_int8_buffer_var, output_buffer_var, device_context_ethos_u)\n' ) # Close Device assert ( - str(main_func.body[1].body.body[0][0][2].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUClose",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][2]) + == "tir.call_extern(" + '"TVMDeviceEthosUClose",' + " device_context_ethos_u)\n" ) # Deactivate Device assert ( - str(main_func.body[2][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUDeactivate",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(str(main_func.body[2])) + == "tir.call_extern(" + '"TVMDeviceEthosUDeactivate",' + " device_context_ethos_u)\n" ) @@ -231,13 +216,10 @@ def test_without_device_api_unpacked_api(non_device_api_main_func): """Test a graph without the Device API with the unpacked internal calls""" main_func = non_device_api_main_func(interface_api="c", use_unpacked_api=True) - + print(str(main_func.body)) assert ( - str(main_func.body[1].body.body[0][0].value) - == "@tir.call_extern(" - + '"tvmgen_default_fused_multiply",' - + " input: handle, input_1: handle, output: handle," - + " dtype=int32)" + str(main_func.body) + == 'tir.call_extern("tvmgen_default_fused_multiply", x_buffer_var, y_buffer_var, output_buffer_var)\n' ) @@ -245,12 +227,18 @@ def test_without_device_api_packed_api(non_device_api_main_func): """Test a graph without the Device API with the packed internal calls""" main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False) - + print(str(main_func.body)) assert ( - str(main_func.body[1].body.body[0][0]) - == 'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n' - + "tir.tvm_struct_set(tvm_value_0, 0, 1, tir.reinterpret((uint64)0))\n" - + 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", input, input, output, tvm_value_0)\n' + str(main_func.body) + == 'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_2 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_1 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n' + + "tir.tvm_struct_set(tvm_value_0, 0, 1, x_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_1, 0, 1, y_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_2, 0, 1, output_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_3, 0, 1, tir.reinterpret((uint64)0))\n" + + 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", tvm_value_0, tvm_value_1, tvm_value_2, tvm_value_3)\n' ) From 439d1efef5d486ce62ced2e5a37a3970d9c8d2d5 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 12 Jan 2022 16:40:33 +0000 Subject: [PATCH 10/15] USMP integration with AoT executor codegen * moving algo/algo.h to algorithms.h * creating two functions for USMP and StorageRewrite * expanding codegen variable names to more descriptive * removing unncessary print functions and print statements * re-using relay::backend::SanitizeName Change-Id: I890caa0104c07a4883eb0c34d3bfcfb8bb56653f --- .../tir/usmp/{algo/algo.h => algorithms.h} | 8 +- include/tvm/tir/usmp/utils.h | 10 ++ src/relay/backend/aot_executor_codegen.cc | 120 ++++++++++-------- src/target/source/codegen_source_base.cc | 10 -- src/target/source/codegen_source_base.h | 6 - src/target/source/source_module.cc | 17 +-- src/tir/usmp/algo/greedy.cc | 1 + src/tir/usmp/transform/assign_pool_info.cc | 2 +- src/tir/usmp/unified_static_memory_planner.cc | 8 +- src/tir/usmp/utils.cc | 14 -- tests/python/relay/aot/test_c_device_api.py | 3 - 11 files changed, 95 
insertions(+), 104 deletions(-) rename include/tvm/tir/usmp/{algo/algo.h => algorithms.h} (93%) diff --git a/include/tvm/tir/usmp/algo/algo.h b/include/tvm/tir/usmp/algorithms.h similarity index 93% rename from include/tvm/tir/usmp/algo/algo.h rename to include/tvm/tir/usmp/algorithms.h index 8a2e3475ce19..77276a2c931c 100644 --- a/include/tvm/tir/usmp/algo/algo.h +++ b/include/tvm/tir/usmp/algorithms.h @@ -18,12 +18,12 @@ */ /*! - * \file tir/usmp/algo/algo.h + * \file tir/usmp/algorithms.h * \brief The memory planning algorithm for USMP */ -#ifndef TVM_TIR_USMP_ALGO_ALGO_H_ -#define TVM_TIR_USMP_ALGO_ALGO_H_ +#ifndef TVM_TIR_USMP_ALGORITHMS_H_ +#define TVM_TIR_USMP_ALGORITHMS_H_ #include @@ -59,4 +59,4 @@ Map GreedyByConflicts(const Array& buffe } // namespace tir } // namespace tvm -#endif // TVM_TIR_USMP_ALGO_ALGO_H_ +#endif // TVM_TIR_USMP_ALGORITHMS_H_ diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index 28194143e7e8..ec909a296fd8 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -31,6 +31,16 @@ #include namespace tvm { + +/*! + * \brief PassContext option to enable the USMP + */ +constexpr const char* kUSMPEnableOption = "tir.usmp.enable"; +/*! + * \brief PassContext option to select the memory planning algorithm in USMP + */ +constexpr const char* kUSMPAlgorithmOption = "tir.usmp.algorithm"; + namespace tir { namespace usmp { diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 3b9ffa75c275..dd66758d8c71 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -687,62 +687,73 @@ class AOTExecutorCodegen : public MixedModeVisitor { } /*! - * brief This function is a wrapper to run memory planning - * followed by recording the latest workspaces required. + * brief Run USMP to plan memory for lowered IRModule */ - IRModule PlanMemoryLoweredModule(const IRModule& mod) { - transform::PassContext pass_ctx = transform::PassContext::Current(); - bool enable_usmp = pass_ctx->GetConfig("tir.usmp.enable", Bool(false)).value(); - - IRModule lowered_mod = mod->ShallowCopy(); + IRModule PlanMemoryWithUSMP(const IRModule& mod) { Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); - if (enable_usmp) { - lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); - // Update workspace size based on the pool allocations. - Optional> allocated_pool_infos = - lowered_mod->GetAttr>(tvm::attr::kPoolArgs); - int main_workspace_size = 0; - if (allocated_pool_infos) { - for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : - allocated_pool_infos.value()) { - main_workspace_size += allocated_pool_info->allocated_size->value; - } + IRModule lowered_mod = mod->ShallowCopy(); + lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); + // Update workspace size based on the pool allocations. 
+ for (const auto& kv : function_metadata_) { + if (lowered_mod->ContainGlobalVar(kv.first) && + lowered_mod->Lookup(kv.first)->IsInstance()) { + tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); + Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + kv.second->workspace_sizes.Set(tgt, ws); } - for (const auto& kv : function_metadata_) { - if (lowered_mod->ContainGlobalVar(kv.first) && - lowered_mod->Lookup(kv.first)->IsInstance()) { - tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); - Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); - const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); - kv.second->workspace_sizes.Set(tgt, ws); + } + Optional> allocated_pool_infos = + lowered_mod->GetAttr>(tvm::attr::kPoolArgs); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.clear(); + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + for (const auto& kv : allocated_pool_info->pool_info->target_access) { + Target tgt = kv.first; + if (main_func_info->workspace_sizes.find(tgt) == main_func_info->workspace_sizes.end()) { + main_func_info->workspace_sizes.Set(tgt, allocated_pool_info->allocated_size); + } else { + main_func_info->workspace_sizes.Set(tgt, + main_func_info->workspace_sizes[tgt]->value + + allocated_pool_info->allocated_size->value); + } } } - backend::FunctionInfo main_func_info = - lowered_mod->GetAttr("main_func_info").value(); - main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); - } else { - // Running StorageRewrite just on the main function - tir::PrimFunc tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - IRModule main_func_mod; - main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), - tir_main_func); - main_func_mod = tir::transform::StorageRewrite()(main_func_mod); - lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), - main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - // Use the PrimFunc to calculate the workspace required to service the allocates - Integer main_workspace_size = - CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); - backend::FunctionInfo main_func_info = - lowered_mod->GetAttr("main_func_info").value(); - main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); } + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + return lowered_mod; + } + + /*! 
+ * brief Run StorageRewrite to plan memory for lowered IRModule + */ + IRModule PlanMemoryWithStorageRewrite(const IRModule& mod) { + Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr("workspace-byte-alignment").value_or(16); + IRModule lowered_mod = mod->ShallowCopy(); + // Running StorageRewrite just on the main function + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + IRModule main_func_mod; + main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + tir_main_func); + main_func_mod = tir::transform::StorageRewrite()(main_func_mod); + lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + // Use the PrimFunc to calculate the workspace required to service the allocates + Integer main_workspace_size_bytes = + CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size_bytes); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); return lowered_mod; } @@ -850,7 +861,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - std::string input_name = codegen::CodeGenSourceBase::SanitiseName(input->name_hint()); + std::string input_name = SanitizeName(input->name_hint()); CreateIOVar(input, input_name); } @@ -895,7 +906,14 @@ class AOTExecutorCodegen : public MixedModeVisitor { lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); // Parallel for loops are not supported in AoT codegen. lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod); - lowered_mod = PlanMemoryLoweredModule(lowered_mod); + + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool enable_usmp = pass_ctx->GetConfig(kUSMPEnableOption, Bool(false)).value(); + if (enable_usmp) { + lowered_mod = PlanMemoryWithUSMP(lowered_mod); + } else { + lowered_mod = PlanMemoryWithStorageRewrite(lowered_mod); + } ret.function_metadata = std::move(function_metadata_); // Legalize AOT if needed. 
This means that all the packed calls @@ -928,12 +946,10 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); Optional> allocated_pool_infos = tir_main_func->GetAttr>(tvm::attr::kPoolArgs); - int main_workspace_size = 0; if (allocated_pool_infos) { for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { pool_vars.push_back(allocated_pool_info->pool_var.value()); pool_var_info.Set(allocated_pool_info->pool_var.value(), allocated_pool_info); - main_workspace_size += allocated_pool_info->allocated_size->value; } } Array devices = ListDevices(); diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index e0becee2dd64..9f0cf9a70b61 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,15 +113,5 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -std::string CodeGenSourceBase::SanitiseName(std::string name) { - std::replace_if( - name.begin(), name.end(), - [](char c) { - { return !std::isalnum(c); } - }, - '_'); - return name; -} - } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index ff94fa68dae7..d938469b8969 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,12 +53,6 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); - /*! - * \brief Sanitize names by removing illegal characters - * \param name The name to be sanitised. - */ - static std::string SanitiseName(std::string name); - protected: /*! \brief entry in ssa assign map */ struct SSAEntry { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 50cba6b0b996..deec2c343ae2 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -26,19 +26,16 @@ #include #include #include -#include -#include -#include #include #include #include +#include "../../relay/backend/name_transforms.h" #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" #include "codegen_c.h" -#include "codegen_source_base.h" namespace tvm { namespace codegen { @@ -203,13 +200,13 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } String GenerateDLTensorStructWrapper(String reference_arg) { - code_ << "DLTensor " << reference_arg << "_dlt = {\n"; + code_ << "DLTensor " << reference_arg << "_dltensor = {\n"; code_ << ".data = &" << reference_arg << "\n"; code_ << "};\n"; - code_ << "TVMValue " << reference_arg << "_tvmv = {\n"; - code_ << ".v_handle = &" << reference_arg << "_dlt\n"; + code_ << "TVMValue " << reference_arg << "_tvm_value = {\n"; + code_ << ".v_handle = &" << reference_arg << "_dltensor\n"; code_ << "};\n"; - return reference_arg + "_tvmv"; + return reference_arg + "_tvm_value"; } void GenerateInternalWorkspaceBuffers() { @@ -374,7 +371,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } else { codegen_c_.PrintType(input_var.dtype(), call_args_ss); } - call_args_ss << " " << codegen_c_.SanitiseName(input_var->name_hint) << ","; + call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ","; } for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { call_args_ss << "void* output" << i << ","; @@ -411,7 +408,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { { 
std::stringstream call_args_ss; for (const auto& input : metadata_->inputs) { - call_args_ss << "inputs->" << codegen_c_.SanitiseName(input->name_hint) << ","; + call_args_ss << "inputs->" << relay::backend::SanitizeName(input->name_hint) << ","; } if (metadata_->num_outputs == 1) { call_args_ss << "outputs->output,"; diff --git a/src/tir/usmp/algo/greedy.cc b/src/tir/usmp/algo/greedy.cc index a434d206162f..324474c569d4 100644 --- a/src/tir/usmp/algo/greedy.cc +++ b/src/tir/usmp/algo/greedy.cc @@ -40,6 +40,7 @@ #include #include #include +#include #include namespace tvm { diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc index ef6559b27caa..516ddd1a241b 100644 --- a/src/tir/usmp/transform/assign_pool_info.cc +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 91e0218e34c8..5a2125077566 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,8 +35,8 @@ namespace tvm { -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.enable", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); +TVM_REGISTER_PASS_CONFIG_OPTION(kUSMPEnableOption, Bool); +TVM_REGISTER_PASS_CONFIG_OPTION(kUSMPAlgorithmOption, String); namespace tir { namespace usmp { @@ -79,7 +79,7 @@ namespace transform { tvm::transform::Pass UnifiedStaticMemoryPlanner() { auto usmp_main_pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { - auto algorithm_str = ctx->GetConfig("tir.usmp.algorithm", String(usmp::kDefaultAlgo)); + auto algorithm_str = ctx->GetConfig(kUSMPAlgorithmOption, String(usmp::kDefaultAlgo)); return Downcast( usmp::PlanMemory(m, algorithm_str.value_or(String(usmp::kDefaultAlgo)))); }; diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 36aa3b6d768a..1fff70f5892e 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -174,20 +174,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info return ret; } -void PrintConflicts(const Array& bi_arr) { - for (const auto& bi : bi_arr) { - std::stringstream ss; - ss << "buf=" << bi->name_hint; - ss << "|conflicts="; - for (const auto& conflict_bi : bi->conflicts) { - BufferInfo cbi = Downcast(conflict_bi); - ss << cbi->name_hint << ","; - } - ss << "\n"; - LOG(INFO) << ss.str(); - } -} - Map AssignStmtPoolAllocations( const Map& buffer_info_to_stmt, const Map& buffer_info_to_pool_allocation) { diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index c4ca733422e6..d369fd0a4a30 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -140,7 +140,6 @@ def test_device_api_hooks_unpacked_api(device_api_main_func): == "tir.call_extern(" + '"TVMDeviceEthosUActivate",' + " device_context_ethos_u)\n" ) # Open Device - print(str(main_func.body[1])) assert ( str(main_func.body[1][0][0][0]) == "tir.call_extern(" + '"TVMDeviceEthosUOpen",' + " device_context_ethos_u)\n" @@ -216,7 +215,6 @@ def test_without_device_api_unpacked_api(non_device_api_main_func): """Test a graph without the Device API with the unpacked internal calls""" main_func = non_device_api_main_func(interface_api="c", use_unpacked_api=True) - print(str(main_func.body)) assert ( 
str(main_func.body) == 'tir.call_extern("tvmgen_default_fused_multiply", x_buffer_var, y_buffer_var, output_buffer_var)\n' @@ -227,7 +225,6 @@ def test_without_device_api_packed_api(non_device_api_main_func): """Test a graph without the Device API with the packed internal calls""" main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False) - print(str(main_func.body)) assert ( str(main_func.body) == 'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n' From 7558345f845e789d771ad5bdfacd92e123151272 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Thu, 13 Jan 2022 10:37:22 +0000 Subject: [PATCH 11/15] USMP integration with AoT executor codegen Moving PrintType from codegen_c.cc to codegen_source_base.cc to be accessible by source_module.cc Change-Id: Icb6a85de4c26110d2fea370fbd75e02a3639de59 --- src/target/source/codegen_c.cc | 55 ------------------------ src/target/source/codegen_c.h | 13 +----- src/target/source/codegen_source_base.cc | 55 ++++++++++++++++++++++++ src/target/source/codegen_source_base.h | 12 ++++++ src/target/source/source_module.cc | 36 +++++++++++----- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index ac6f3adad606..e6f81646242d 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -362,61 +362,6 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { ICHECK_EQ(scope, "global"); } -void CodeGenC::PrintType(DataType t, std::ostream& os) { // NOLINT(*) - ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; - if (t.is_handle()) { - os << "void*"; - return; - } - if (t.is_float()) { - if (t.bits() == 32) { - os << "float"; - return; - } - if (t.bits() == 64) { - os << "double"; - return; - } - } else if (t.is_uint()) { - switch (t.bits()) { - case 8: - case 16: - case 32: - case 64: { - os << "uint" << t.bits() << "_t"; - return; - } - case 1: - os << "int"; - return; - } - } else if (t.is_int()) { - switch (t.bits()) { - case 8: - case 16: - case 32: - case 64: { - os << "int" << t.bits() << "_t"; - return; - } - } - } - LOG(FATAL) << "Cannot convert type " << t << " to C type"; -} - -void CodeGenC::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) - if (auto* ptr = type.as()) { - return PrintType(ptr->dtype, os); - } else if (auto* ptr = type.as()) { - PrintType(ptr->element_type, os); - os << '*'; - } else if (IsVoidType(type)) { - os << "void"; - } else { - LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; - } -} - inline void PrintConst(const IntImmNode* op, std::ostream& os, CodeGenC* p) { // NOLINT(*) if (op->dtype == DataType::Int(32)) { std::ostringstream temp; diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index 299f7e0a9cef..3b042b9fbd2c 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -163,18 +163,7 @@ class CodeGenC : public ExprFunctor, void VisitStmt_(const AssertStmtNode* op) override; void VisitStmt_(const EvaluateNode* op) override; void VisitStmt_(const SeqStmtNode* op) override; - /*! - * Print Type represetnation of type t. - * \param t The type representation. - * \param os The stream to print the ctype into - */ - virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) - /*! - * Print Type represetnation of type type. - * \param type The type representation. 
- * \param os The stream to print the ctype into - */ - virtual void PrintType(const Type& type, std::ostream& os); // NOLINT(*) + /*! * \brief Print expr representing the thread tag * \param IterVar iv The thread index to be binded; diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 9f0cf9a70b61..933bd0502662 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,5 +113,60 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } +void CodeGenSourceBase::PrintType(DataType t, std::ostream& os) { // NOLINT(*) + ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; + if (t.is_handle()) { + os << "void*"; + return; + } + if (t.is_float()) { + if (t.bits() == 32) { + os << "float"; + return; + } + if (t.bits() == 64) { + os << "double"; + return; + } + } else if (t.is_uint()) { + switch (t.bits()) { + case 8: + case 16: + case 32: + case 64: { + os << "uint" << t.bits() << "_t"; + return; + } + case 1: + os << "int"; + return; + } + } else if (t.is_int()) { + switch (t.bits()) { + case 8: + case 16: + case 32: + case 64: { + os << "int" << t.bits() << "_t"; + return; + } + } + } + LOG(FATAL) << "Cannot convert type " << t << " to C type"; +} + +void CodeGenSourceBase::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) + if (auto* ptr = type.as()) { + return PrintType(ptr->dtype, os); + } else if (auto* ptr = type.as()) { + PrintType(ptr->element_type, os); + os << '*'; + } else if (IsVoidType(type)) { + os << "void"; + } else { + LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; + } +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index d938469b8969..a516ff6b3d96 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -52,6 +52,18 @@ class CodeGenSourceBase { * \param value The constant value. */ void MarkConst(std::string value); + /*! + * Print Type represetnation of type t. + * \param t The type representation. + * \param os The stream to print the ctype into + */ + virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) + /*! + * Print Type represetnation of type type. + * \param type The type representation. + * \param os The stream to print the ctype into + */ + virtual void PrintType(const Type& type, std::ostream& os); // NOLINT(*) protected: /*! \brief entry in ssa assign map */ diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index deec2c343ae2..8fe95970a1c6 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -35,7 +35,7 @@ #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" -#include "codegen_c.h" +#include "codegen_source_base.h" namespace tvm { namespace codegen { @@ -128,6 +128,22 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, return runtime::Module(n); } +/*! + * \brief A concrete class to get access to base methods of CodegenSourceBase. + * + * This class exist to get access to methods of CodegenSourceBase without duplicating + * them. Therefore, keeping alignment with how codegen and source_module here generates + * code. + */ +class ConcreteCodegenSourceBase : public CodeGenSourceBase { + /*! 
+ * \brief Do nothing as this class exist to get access to methods of CodeGenSourceBase + */ + void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) final { + return; + } +}; + class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, @@ -166,7 +182,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Target target_; relay::Runtime runtime_; runtime::Metadata metadata_; - CodeGenC codegen_c_; + ConcreteCodegenSourceBase codegen_c_base_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -245,9 +261,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::stringstream call_args_ss; for (const tir::Var& input_var : metadata_->inputs) { if (input_var->type_annotation.defined()) { - codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(input_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(input_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(input_var.dtype(), call_args_ss); } call_args_ss << " " << input_var->name_hint << ","; } @@ -256,9 +272,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } for (const tir::Var& pool_var : metadata_->pools) { if (pool_var->type_annotation.defined()) { - codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(pool_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(pool_var.dtype(), call_args_ss); } call_args_ss << " " << pool_var->name_hint << ","; } @@ -367,9 +383,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::stringstream call_args_ss; for (const tir::Var& input_var : metadata_->inputs) { if (input_var->type_annotation.defined()) { - codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(input_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(input_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(input_var.dtype(), call_args_ss); } call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ","; } @@ -378,9 +394,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } for (const tir::Var& pool_var : metadata_->pools) { if (pool_var->type_annotation.defined()) { - codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(pool_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(pool_var.dtype(), call_args_ss); } call_args_ss << " " << pool_var->name_hint << ","; } From 48d25c6697ae4b916f54387c95640b02a7e30bee Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 14:28:47 +0000 Subject: [PATCH 12/15] USMP integration with AoT executor codegen Moving runtime::metadata to be ExecutorCodegeMetadata as it contains metadata produced by ExecutorCodegen for actual codegeneration (not a runtime component). 
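As a minimal sketch of the intended usage (not part of this patch), an executor codegen would construct the relocated metadata object roughly as follows; the values below are placeholders, and includes/namespaces are elided for brevity:

    // Sketch only: mirrors the ExecutorCodegenMetadata constructor added in
    // src/relay/backend/utils.h below; all values here are placeholders.
    relay::backend::ExecutorCodegenMetadata MakeExampleMetadata() {
      Array<tir::Var> inputs;                                   // main function input vars
      Array<tir::Var> pools;                                    // vars bound to USMP pools
      Array<String> devices;                                    // device context names
      Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs;  // pool var -> allocation info
      return relay::backend::ExecutorCodegenMetadata(
          inputs, pools, devices, /*num_outputs=*/1, runtime::kTvmExecutorAot,
          /*mod_name=*/"default", /*interface_api=*/"c", /*unpacked_api=*/true, pool_inputs);
    }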
Change-Id: I13e95573ef331fb995281dbe220db01a7aa91add --- src/relay/backend/aot_executor_codegen.cc | 6 +-- src/relay/backend/build_module.cc | 4 +- src/relay/backend/utils.h | 59 ++++++++++++++++++++++- src/relay/backend/vm/compiler.cc | 3 +- src/runtime/meta_data.h | 56 --------------------- src/target/metadata_module.cc | 2 +- src/target/metadata_module.h | 4 +- src/target/source/source_module.cc | 13 +++-- src/target/source/source_module.h | 4 +- tests/python/relay/aot/corstone300.ld | 9 +--- 10 files changed, 82 insertions(+), 78 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index dd66758d8c71..f076efeb4ac5 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -957,9 +957,9 @@ class AOTExecutorCodegen : public MixedModeVisitor { Array(tir_main_func->params.begin(), tir_main_func->params.begin() + tir_main_func->params.size() - return_sid_.size() - pool_vars.size() - devices.size()); - ret.metadata = - runtime::Metadata(inputs, pool_vars, devices, return_sid_.size(), runtime::kTvmExecutorAot, - mod_name, interface_api, use_unpacked_api_, pool_var_info); + ret.metadata = ExecutorCodegenMetadata(inputs, pool_vars, devices, return_sid_.size(), + runtime::kTvmExecutorAot, mod_name, interface_api, + use_unpacked_api_, pool_var_info); return ret; } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index ccfd30476f67..2f986669e758 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -103,7 +103,9 @@ struct ExecutorCodegen { Array ListDevices() { return CallFunc>("get_devices"); } - runtime::Metadata GetMetadata() { return CallFunc("get_metadata"); } + relay::backend::ExecutorCodegenMetadata GetMetadata() { + return CallFunc("get_metadata"); + } virtual ~ExecutorCodegen() {} protected: diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 658283b5dc36..8e7aff0e6785 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -53,6 +53,63 @@ class TECompiler; namespace backend { using Pass = tvm::transform::Pass; +/*! + * \brief Structure that can be optionally used by the executor codegen + */ +class ExecutorCodegenMetadataNode : public Object { + public: + /*! \brief input information for the main function */ + Array inputs; + /*! \brief pool information for the main function */ + Array pools; + /*! \brief number of outputs of the main function */ + unsigned int num_outputs = 1; + /*! \brief device contexts information for the main function */ + Array devices; + /*! \brief the executor to be used to run the model */ + String executor = runtime::kTvmExecutorGraph; + /*! \brief The external API (packed or c) in use */ + String interface_api; + /*! \brief The internal API (packed or unpacked) in use */ + bool unpacked_api; + /*! \brief the input var names that correspond to pool_inputs */ + Optional> pool_inputs; + + String mod_name = ""; + + static constexpr const uint32_t _type_index = TypeIndex::kDynamic; + static constexpr const char* _type_key = "MetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(ExecutorCodegenMetadataNode, Object); +}; + +/*! + * \brief Managed reference to ExecutorCodegenMetadataNode. 
+ */ +class ExecutorCodegenMetadata : public ObjectRef { + public: + TVM_DLL ExecutorCodegenMetadata(Array inputs, Array pools, + Array devices, int num_outputs, String executor, + String mod_name, String interface_api = "packed", + bool unpacked_api = false, + Map pool_inputs = + Map()) { + auto n = make_object(); + n->inputs = inputs; + n->pools = pools; + n->devices = devices; + n->num_outputs = num_outputs; + n->executor = executor; + n->interface_api = interface_api; + n->unpacked_api = unpacked_api; + n->mod_name = mod_name; + n->pool_inputs = pool_inputs; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(ExecutorCodegenMetadata, ObjectRef, ExecutorCodegenMetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(ExecutorCodegenMetadataNode); +}; + /*! * \brief The static storage information for each Tensor in the result of a Relay expression * (as per relay::FlattenTupleType). @@ -147,7 +204,7 @@ struct LoweredOutput { Array external_mods; Map function_metadata; std::unordered_map> params; - runtime::Metadata metadata; + ExecutorCodegenMetadata metadata; }; /*! diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 73f4b672a81c..f68dd9f8d2df 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1162,7 +1162,8 @@ void VMCompiler::Codegen() { } lib = codegen::CreateMetadataModule(params_, lib, ext_mods, config_->host_target, - Runtime::Create("cpp"), runtime::Metadata()); + Runtime::Create("cpp"), + relay::backend::ExecutorCodegenMetadata()); exec_->SetLib(lib); } diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 80d3eeb09dd8..3b9f5db4da6a 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -50,62 +50,6 @@ inline String get_name_mangled(const String& module_name, const String& name) { return ss.str(); } -/*! - * \brief Structure that can be optionally used by the executor codegen - */ -class MetadataNode : public Object { - public: - /*! \brief input information for the main function */ - Array inputs; - /*! \brief pool information for the main function */ - Array pools; - /*! \brief number of outputs of the main function */ - unsigned int num_outputs = 1; - /*! \brief device contexts information for the main function */ - Array devices; - /*! \brief the executor to be used to run the model */ - String executor = kTvmExecutorGraph; - /*! \brief The external API (packed or c) in use */ - String interface_api; - /*! \brief The internal API (packed or unpacked) in use */ - bool unpacked_api; - /*! \brief the input var names that correspond to pool_inputs */ - Optional> pool_inputs; - - String mod_name = ""; - - static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const char* _type_key = "MetadataObj"; - TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); -}; - -/*! - * \brief Managed reference to MetadataNode. 
- */ -class Metadata : public ObjectRef { - public: - TVM_DLL Metadata(Array inputs, Array pools, Array devices, - int num_outputs, String executor, String mod_name, - String interface_api = "packed", bool unpacked_api = false, - Map pool_inputs = - Map()) { - auto n = make_object(); - n->inputs = inputs; - n->pools = pools; - n->devices = devices; - n->num_outputs = num_outputs; - n->executor = executor; - n->interface_api = interface_api; - n->unpacked_api = unpacked_api; - n->mod_name = mod_name; - n->pool_inputs = pool_inputs; - data_ = std::move(n); - } - - TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); -}; - /*! * \brief Create a metadata module object. * diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 2b190e5d66ed..2facf1de64d5 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -37,7 +37,7 @@ namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, tvm::runtime::Module target_module, const Array& ext_modules, Target target, - tvm::relay::Runtime runtime, runtime::Metadata metadata) { + tvm::relay::Runtime runtime, relay::backend::ExecutorCodegenMetadata metadata) { // Here we split modules into two groups: // 1. Those modules which can be exported to C-runtime. These are DSO-exportable // (i.e. llvm or c) modules which return nothing from get_const_vars(). diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h index ee6f7231b3a1..2afcf3497ab8 100644 --- a/src/target/metadata_module.h +++ b/src/target/metadata_module.h @@ -33,7 +33,7 @@ #include #include -#include "../runtime/meta_data.h" +#include "../relay/backend/utils.h" namespace tvm { namespace codegen { @@ -54,7 +54,7 @@ namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, runtime::Module target_module, const Array& ext_modules, Target target, tvm::relay::Runtime runtime, - runtime::Metadata metadata); + relay::backend::ExecutorCodegenMetadata metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 8fe95970a1c6..43fc619133b5 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -147,7 +147,8 @@ class ConcreteCodegenSourceBase : public CodeGenSourceBase { class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, - Target target, relay::Runtime runtime, runtime::Metadata metadata) + Target target, relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata) : fmt_(fmt), func_names_(func_names), target_(target), @@ -181,7 +182,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Array func_names_; Target target_; relay::Runtime runtime_; - runtime::Metadata metadata_; + relay::backend::ExecutorCodegenMetadata metadata_; ConcreteCodegenSourceBase codegen_c_base_; void CreateFuncRegistry() { @@ -230,7 +231,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { for (const auto& kv : metadata_->pool_inputs.value()) { tir::usmp::AllocatedPoolInfo allocated_pool_info = kv.second; if (allocated_pool_info->pool_info->is_internal) { - code_ << "__attribute__((section(\".bss.tvm\"), "; + code_ << "__attribute__((section(\".data.tvm\"), "; code_ << "aligned(" << 16 << ")))\n"; code_ << "static uint8_t " << allocated_pool_info->pool_info->pool_name << "[" 
<< allocated_pool_info->allocated_size->value << "];\n"; @@ -496,7 +497,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { }; runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - relay::Runtime runtime, runtime::Metadata metadata) { + relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata) { Array func_names; for (runtime::Module mod : modules) { auto pf_funcs = mod.GetFunction("get_func_names"); @@ -580,7 +582,8 @@ TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule") .set_body_typed([](const Array& modules, Target target, relay::Runtime runtime) { // Note that we don't need metadata when we compile a single operator - return CreateCSourceCrtMetadataModule(modules, target, runtime, runtime::Metadata()); + return CreateCSourceCrtMetadataModule(modules, target, runtime, + relay::backend::ExecutorCodegenMetadata()); }); } // namespace codegen diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index fde363c1198a..3b482a107600 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -29,6 +29,7 @@ #include #include +#include "../../relay/backend/utils.h" #include "../../runtime/meta_data.h" namespace tvm { @@ -43,7 +44,8 @@ namespace codegen { * \return The wrapped module. */ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - relay::Runtime runtime, runtime::Metadata metadata); + relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata); } // namespace codegen } // namespace tvm diff --git a/tests/python/relay/aot/corstone300.ld b/tests/python/relay/aot/corstone300.ld index e066b1538481..a825da74c1db 100644 --- a/tests/python/relay/aot/corstone300.ld +++ b/tests/python/relay/aot/corstone300.ld @@ -141,6 +141,8 @@ SECTIONS . = ALIGN (16); *(.rodata.tvm) . = ALIGN (16); + *(.data.tvm) + . = ALIGN (16); } > DDR .text : @@ -249,13 +251,6 @@ SECTIONS . = ALIGN(16); } > SRAM AT > SRAM - .bss.tvm : - { - . = ALIGN(16); - *(.bss.tvm) - . = ALIGN(16); - } > DDR - .bss.NoInit : { . = ALIGN(16); From 8235183f9f994b17645e63e7a0b6467f79ac431d Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 14:59:15 +0000 Subject: [PATCH 13/15] USMP integration with AoT executor codegen Remove unused function declaration Change-Id: Ib274553938eb5c82fe2b30cecc982481cff3937d --- include/tvm/tir/usmp/utils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index ec909a296fd8..ea37111f3ef0 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -295,7 +295,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info */ Integer CalculateModuleWorkspaceSize(const IRModule& mod); -void PrintConflicts(const Array& buffer_info_arr); /*! * \brief The allocate node attribute to indicate candidate memory pools. 
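With the workspace-pool change in the patch above, internally-managed pools are emitted into a ".data.tvm" section and the corstone300 test linker script now gathers that section into the DDR region in place of the old ".bss.tvm" output section. Purely as an illustration, a pool named "global_workspace" of 1024 bytes (both values are placeholders, not produced by this patch) would be emitted by GenerateInternalWorkspaceBuffers roughly as:

    /* Illustrative generated source only: the pool name and size below are
     * placeholders chosen for the example. */
    __attribute__((section(".data.tvm"), aligned(16)))
    static uint8_t global_workspace[1024];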
From 526f3070abd5dc1b5938bfdee361387d25fbed44 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 17:05:28 +0000 Subject: [PATCH 14/15] USMP integration with AoT executor codegen * Remove unncessary header file inclusion Change-Id: I5575d6226baa74fe09dce1c71176e557d84f669a --- include/tvm/tir/usmp/utils.h | 1 - src/relay/backend/utils.h | 1 + src/runtime/meta_data.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index ea37111f3ef0..582399865d6f 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -295,7 +295,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info */ Integer CalculateModuleWorkspaceSize(const IRModule& mod); - /*! * \brief The allocate node attribute to indicate candidate memory pools. * This needs to be kept in sync with CANDIDATE_MEMORY_POOL_ATTR in diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 8e7aff0e6785..cb019083a9d5 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 3b9f5db4da6a..e83e1a3a7629 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include From 4a7c907ea5e85dc43a577638951a7fc515d0f873 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 18 Jan 2022 15:49:49 +0000 Subject: [PATCH 15/15] USMP integration with AoT executor codegen * fixed a typo * Improved consistenty with arg names for type printers Change-Id: I25263831c8bbfa11e6e0d6f8f8dd998d254294ff --- src/target/source/codegen_source_base.cc | 26 ++++++++++++------------ src/target/source/codegen_source_base.h | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 933bd0502662..5dcf1587bdb9 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,46 +113,46 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -void CodeGenSourceBase::PrintType(DataType t, std::ostream& os) { // NOLINT(*) - ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; - if (t.is_handle()) { +void CodeGenSourceBase::PrintType(DataType type, std::ostream& os) { // NOLINT(*) + ICHECK_EQ(type.lanes(), 1) << "do not yet support vector types"; + if (type.is_handle()) { os << "void*"; return; } - if (t.is_float()) { - if (t.bits() == 32) { + if (type.is_float()) { + if (type.bits() == 32) { os << "float"; return; } - if (t.bits() == 64) { + if (type.bits() == 64) { os << "double"; return; } - } else if (t.is_uint()) { - switch (t.bits()) { + } else if (type.is_uint()) { + switch (type.bits()) { case 8: case 16: case 32: case 64: { - os << "uint" << t.bits() << "_t"; + os << "uint" << type.bits() << "_t"; return; } case 1: os << "int"; return; } - } else if (t.is_int()) { - switch (t.bits()) { + } else if (type.is_int()) { + switch (type.bits()) { case 8: case 16: case 32: case 64: { - os << "int" << t.bits() << "_t"; + os << "int" << type.bits() << "_t"; return; } } } - LOG(FATAL) << "Cannot convert type " << t << " to C type"; + LOG(FATAL) << "Cannot convert type " << type << " to C type"; } void CodeGenSourceBase::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) diff --git a/src/target/source/codegen_source_base.h 
b/src/target/source/codegen_source_base.h index a516ff6b3d96..8f8f9e1b8bf2 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,13 +53,13 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); /*! - * Print Type represetnation of type t. + * Print Type representation of type type. * \param t The type representation. * \param os The stream to print the ctype into */ - virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) + virtual void PrintType(DataType type, std::ostream& os); // NOLINT(*) /*! - * Print Type represetnation of type type. + * Print Type representation of type type. * \param type The type representation. * \param os The stream to print the ctype into */
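With patches 11 and 15 applied, CodeGenSourceBase::PrintType becomes the shared type printer used by the metadata source module. A minimal sketch of the call pattern inside source_module.cc follows; the argument variable is a placeholder, and ConcreteCodegenSourceBase is file-local to source_module.cc, so this is illustrative rather than standalone code:

    // Sketch of how a main-function argument's C type is printed into the
    // call-arguments stream; "input_0" is a placeholder variable.
    ConcreteCodegenSourceBase codegen_c_base;
    std::stringstream call_args_ss;
    tir::Var input_var("input_0", DataType::Handle());
    if (input_var->type_annotation.defined()) {
      codegen_c_base.PrintType(input_var->type_annotation, call_args_ss);
    } else {
      codegen_c_base.PrintType(input_var.dtype(), call_args_ss);
    }
    call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ",";
    // call_args_ss now holds something like "void* input_0,".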