diff --git a/apps/microtvm/arduino/template_project/src/example_project/model.c b/apps/microtvm/arduino/template_project/src/example_project/model.c
index 553665191b14..25d609dacce1 100644
--- a/apps/microtvm/arduino/template_project/src/example_project/model.c
+++ b/apps/microtvm/arduino/template_project/src/example_project/model.c
@@ -86,7 +86,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
 void TVMInitialize() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); }
 
 void TVMExecute(void* input_data, void* output_data) {
-  int ret_val = tvmgen_default_run_model(input_data, output_data);
+  int ret_val = tvmgen_default___tvm_main__(input_data, output_data);
   if (ret_val != 0) {
     TVMPlatformAbort(kTvmErrorPlatformCheckFailure);
   }
diff --git a/apps/microtvm/zephyr_cmsisnn/src/main.c b/apps/microtvm/zephyr_cmsisnn/src/main.c
index 274bd63d3ea5..31f6cd0cc1d0 100644
--- a/apps/microtvm/zephyr_cmsisnn/src/main.c
+++ b/apps/microtvm/zephyr_cmsisnn/src/main.c
@@ -34,7 +34,7 @@ extern float output_storage[12];
 
 extern const size_t output_len;
 
-static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE + 512];
+static uint8_t g_crt_workspace[TVMGEN_DEFAULT_WORKSPACE_SIZE];
 tvm_workspace_t app_workspace;
 
 void TVMLogf(const char* msg, ...) {
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 7b5326a44921..2e2a79b1ca53 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -235,8 +235,6 @@ constexpr const char* tvm_module_main = "__tvm_main__";
 constexpr const char* tvm_param_prefix = "__tvm_param__";
 /*! \brief A PackedFunc that looks up linked parameters by storage_id. */
 constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param";
-/*! \brief The main AOT executor function generated from TIR */
-constexpr const char* tvm_run_func_suffix = "run_model";
 /*! \brief Model entrypoint generated as an interface to the AOT function outside of TIR */
 constexpr const char* tvm_entrypoint_suffix = "run";
 }  // namespace symbol
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 2168ea74a0ff..a25ef458906c 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -658,8 +658,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
 
     // Define the PrimFunc attributes
     Map<String, ObjectRef> dict_attrs;
-    String run_func_name =
-        runtime::get_name_mangled(mod_name, runtime::symbol::tvm_run_func_suffix);
+    String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_module_main);
     dict_attrs.Set("global_symbol", run_func_name);
     dict_attrs.Set("runner_function", Bool(true));
     dict_attrs.Set(tvm::attr::kTarget, target_host_);
@@ -702,6 +701,35 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     }
   }
 
+  /*!
+   * brief Calculate workspace sizes for PrimFuncs in the IRModule
+   */
+  Map<String, FunctionInfo> CalculateWorkspaceSizes(
+      const IRModule& lowered_mod, const Map<String, FunctionInfo>& function_metadata) {
+    Executor executor_config = lowered_mod->GetAttr<Executor>(tvm::attr::kExecutor).value();
+    Integer workspace_byte_alignment =
+        executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
+    Map<String, FunctionInfo> updated_function_metadata;
+    for (const auto& kv : lowered_mod->functions) {
+      GlobalVar global_var = kv.first;
+      BaseFunc base_func = kv.second;
+      if (base_func->IsInstance<tir::PrimFuncNode>()) {
+        tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(base_func);
+        Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value();
+        const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment);
+        if (function_metadata.count(global_var->name_hint)) {
+          updated_function_metadata.Set(global_var->name_hint,
+                                        function_metadata[global_var->name_hint]);
+          updated_function_metadata[global_var->name_hint]->workspace_sizes.Set(tgt, ws);
+        } else {
+          FunctionInfo finfo{{{tgt, ws}}, {}, {}, {{tgt, pfunc}}, {}};
+          updated_function_metadata.Set(global_var->name_hint, finfo);
+        }
+      }
+    }
+    return updated_function_metadata;
+  }
+
   /*!
    * brief Run USMP to plan memory for lowered IRModule
    */
@@ -710,17 +738,8 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     Integer workspace_byte_alignment =
         executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
     IRModule lowered_mod = mod->ShallowCopy();
+    function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_);
     lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod);
-    // Update workspace size based on the pool allocations.
-    for (const auto& kv : function_metadata_) {
-      if (lowered_mod->ContainGlobalVar(kv.first) &&
-          lowered_mod->Lookup(kv.first)->IsInstance<tir::PrimFuncNode>()) {
-        tir::PrimFunc pfunc = Downcast<tir::PrimFunc>(lowered_mod->Lookup(kv.first));
-        Target tgt = pfunc->GetAttr<Target>(tvm::attr::kTarget).value();
-        const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment);
-        kv.second->workspace_sizes.Set(tgt, ws);
-      }
-    }
     Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
         lowered_mod->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
     backend::FunctionInfo main_func_info =
@@ -752,17 +771,18 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     Integer workspace_byte_alignment =
         executor_config->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
     IRModule lowered_mod = mod->ShallowCopy();
+    function_metadata_ = CalculateWorkspaceSizes(lowered_mod, function_metadata_);
     // Running StorageRewrite just on the main function
     tir::PrimFunc tir_main_func =
-        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
     IRModule main_func_mod;
-    main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix),
+    main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main),
                           tir_main_func);
     main_func_mod = tir::transform::StorageRewrite()(main_func_mod);
-    lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix),
-                        main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+    lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main),
+                        main_func_mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
     tir_main_func =
-        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
     // Use the PrimFunc to calculate the workspace required to service the allocates
     Integer main_workspace_size_bytes =
         CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment);
@@ -920,7 +940,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     // function and replacing it with its TIR version. We should try to make this a Pass.
     lowered_mod->Remove(lowered_mod->GetGlobalVar("main"));
     auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size());
-    lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func);
+    lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_module_main), prim_func);
     // Parallel for loops are not supported in AoT codegen.
     lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod);
 
@@ -960,7 +980,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_var_info;
     std::vector<tir::Var> pool_vars;
     tir::PrimFunc tir_main_func =
-        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+        Downcast<tir::PrimFunc>(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
     Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
         tir_main_func->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
     if (allocated_pool_infos) {
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 907eb061303f..97461ca2091f 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -474,7 +474,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
   }
 
   void GenerateAOTDescriptor() {
-    const std::string run_func_suffix = ::tvm::runtime::symbol::tvm_run_func_suffix;
+    const std::string run_func_suffix = ::tvm::runtime::symbol::tvm_module_main;
     const std::string tvm_entrypoint_suffix = ::tvm::runtime::symbol::tvm_entrypoint_suffix;
     const std::string run_func_mangled =
         runtime::get_name_mangled(metadata_->mod_name, run_func_suffix);
diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc
index 9d8e36137c37..a2304f3b9e3d 100644
--- a/src/tir/usmp/transform/assign_pool_info.cc
+++ b/src/tir/usmp/transform/assign_pool_info.cc
@@ -42,7 +42,7 @@ class PoolInfoAssigner : public StmtExprMutator {
  public:
   explicit PoolInfoAssigner(const IRModule& module) {
     PrimFunc main_func =
-        Downcast<PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+        Downcast<PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_module_main));
     ICHECK(main_func.defined()) << "main function is not in the module";
     Optional<Target> target_host = main_func->GetAttr<Target>(tvm::attr::kTarget);
     ICHECK(target_host) << "main function does not have a target attr";
@@ -79,7 +79,7 @@ class PoolInfoAssigner : public StmtExprMutator {
 PoolInfo PoolInfoAssigner::CreateDefaultMemoryPool(const tvm::IRModule& module) {
   Map<Target, String> target_access;
   tir::PrimFunc tir_main_func =
-      Downcast<tir::PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+      Downcast<tir::PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_module_main));
   Target target_host = tir_main_func->GetAttr<Target>(tvm::attr::kTarget).value();
   for (const auto& kv : module->functions) {
     BaseFunc func = kv.second;
diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
index 999ca37d2128..6abc48c31be0 100644
--- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
+++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc
@@ -331,7 +331,7 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const LoadNode* op) {
 }
 
 IRModule PoolAllocationToOffsetConverter::operator()() {
-  GlobalVar gv = module_->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix);
+  GlobalVar gv = module_->GetGlobalVar(::tvm::runtime::symbol::tvm_module_main);
   PrimFunc main_func = Downcast<PrimFunc>(module_->Lookup(gv));
   ScopeInfo si = UpdateFunctionScopeInfo(main_func);
   this->scope_stack.push(si);
diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc
index 3b941d3cc021..e848440f029e 100644
--- a/src/tir/usmp/unified_static_memory_planner.cc
+++ b/src/tir/usmp/unified_static_memory_planner.cc
@@ -51,7 +51,7 @@ static std::unordered_map<String, std::function<Map<BufferInfo, PoolAllocation>(
 
 IRModule PlanMemory(const IRModule& mod, String algo) {
   VLOG(1) << "workspace required = " << CalculateModuleWorkspaceSize(mod);
-  PrimFunc main_func = Downcast<PrimFunc>(mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+  PrimFunc main_func = Downcast<PrimFunc>(mod->Lookup(::tvm::runtime::symbol::tvm_module_main));
   BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod);
   Array<BufferInfo> buffer_info_arr =
       CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts);
@@ -63,7 +63,7 @@ IRModule PlanMemory(const IRModule& mod, String algo) {
       buffer_info_analysis->buffer_info_stmts, buffer_info_pool_allocations);
   IRModule ret = transform::ConvertPoolAllocationsToOffsets(stmt_pool_allocations)(mod);
   tir::PrimFunc tir_main_func =
-      Downcast<tir::PrimFunc>(ret->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+      Downcast<tir::PrimFunc>(ret->Lookup(::tvm::runtime::symbol::tvm_module_main));
   Optional<Array<tir::usmp::AllocatedPoolInfo>> allocated_pool_infos =
       tir_main_func->GetAttr<Array<tir::usmp::AllocatedPoolInfo>>(tvm::attr::kPoolArgs);
   if (allocated_pool_infos) {
diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc
index 5c95f7d7a7be..03fac325905c 100644
--- a/src/tir/usmp/utils.cc
+++ b/src/tir/usmp/utils.cc
@@ -181,7 +181,7 @@ class ModuleWorkspaceSizeCalculator : public StmtExprVisitor {
     for (const auto& gv_func : mod_->functions) {
       functions_.Set(gv_func.first->name_hint, Downcast<PrimFunc>(gv_func.second));
     }
-    main_func_ = Downcast<PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
+    main_func_ = Downcast<PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_module_main));
     ICHECK(main_func_.defined()) << "main function is not in the module";
     Optional<Target> target_host = main_func_->GetAttr<Target>(tvm::attr::kTarget);
     ICHECK(target_host) << "main function does not have a target attr";
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 4bdaef7a74ca..b355c440e006 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -242,12 +242,13 @@ def build_source(
 def verify_source(
     models: List[AOTCompiledTestModel],
     accel="ethos-u55-256",
+    enable_usmp=True,
 ):
     """
     This method verifies the generated source from an NPU module by building it and running on an FVP.
     """
     interface_api = "c"
-    test_runner = create_test_runner(accel)
+    test_runner = create_test_runner(accel, enable_usmp)
     run_and_check(
         models,
         test_runner,
diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py
index e9c6da5be18a..7e3140ff514a 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -71,7 +71,7 @@ def test_forward_mobilenet_v1(accel_type, enable_usmp):
     compiled_models = infra.build_source(
         mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp
     )
-    infra.verify_source(compiled_models, accel_type)
+    infra.verify_source(compiled_models, accel_type, enable_usmp=enable_usmp)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
index b7021e5a8984..63817fc4b965 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -265,21 +265,29 @@ def emit_data_linkage(output_file, data_linkage):
 
 
 def emit_main_prologue(
-    main_file, custom_prologue, workspace_bytes, data_linkage, compiled_models, interface_api
+    main_file,
+    custom_prologue,
+    workspace_bytes,
+    data_linkage,
+    compiled_models,
+    interface_api,
+    use_stack_allocator=True,
 ):
-    # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
-    workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}"
-    if interface_api == "c":
-        for compiled_model in compiled_models:
-            model = compiled_model.model
-            workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE"
-    workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n"
-    main_file.write(workspace_define)
-    emit_data_linkage(main_file, data_linkage)
-    main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")
-    main_file.write("tvm_workspace_t app_workspace;\n")
-    main_file.write(
-        """
+    if use_stack_allocator:
+        workspace_define = f"#define WORKSPACE_SIZE ({workspace_bytes}"
+        if interface_api == "c":
+            for compiled_model in compiled_models:
+                model = compiled_model.model
+                workspace_define += f" + TVMGEN_{model.name.upper()}_WORKSPACE_SIZE"
+        # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
+        workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n"
+        main_file.write(workspace_define)
+        emit_data_linkage(main_file, data_linkage)
+        main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")
+        main_file.write("tvm_workspace_t app_workspace;\n")
+        main_file.write(
+            """
+            
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
     return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
 }
@@ -287,7 +295,26 @@ def emit_main_prologue(
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
     return StackMemoryManager_Free(&app_workspace,ptr);
 }
+        """
+        )
+    else:
+        # An implementation is not needed for these if the stack allocator is not used
+        main_file.write(
+            """
+            
+tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
+    return kTvmErrorFunctionCallNotImplemented;
+}
 
+tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
+    return kTvmErrorFunctionCallNotImplemented;
+}
+
+            """
+        )
+    main_file.write(
+        """
+    
 void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); }
 
 void TVMLogf(const char* msg, ...) {
@@ -296,10 +323,10 @@ def emit_main_prologue(
   vfprintf(stdout, msg, args);
   va_end(args);
 }
-
+    
 TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {}
 int main(){\n
-"""
+    """
     )
     main_file.write(custom_prologue)
 
@@ -511,6 +538,7 @@ def create_main(
     data_linkage,
     interface_api,
     workspace_bytes,
+    use_stack_allocator=True,
 ):
     file_path = pathlib.Path(f"{output_path}/" + test_name).resolve()
     # create header file
@@ -533,8 +561,10 @@ def create_main(
             data_linkage,
             compiled_models,
             interface_api,
+            use_stack_allocator,
         )
-        emit_main_init_memory_manager(main_file)
+        if use_stack_allocator:
+            emit_main_init_memory_manager(main_file)
 
         if interface_api == "c":
             for compiled_model in compiled_models:
@@ -709,11 +739,14 @@ def run_and_check(
         t = tarfile.open(tar_file)
         t.extractall(base_path)
 
-        workspace_bytes = model.extra_memory_in_bytes
-        use_usmp = runner.pass_config.get("tir.usmp.enable", False)
-        if interface_api == "packed" and not use_usmp:
+        # Interface C APIs does not need compiler generated
+        # workspace to generate the test application, because
+        # workspace size is codegen'd as a macro to
+        # tvmgen_<model_name>.h.
+        if interface_api != "c":
             workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)
 
+        workspace_bytes += model.extra_memory_in_bytes
         for key in model.inputs:
             sanitized_tensor_name = re.sub(r"\W", "_", key)
             create_header_file(
@@ -738,6 +771,10 @@ def run_and_check(
                 data_linkage,
             )
 
+    use_usmp = runner.pass_config.get("tir.usmp.enable", False)
+    # We only need the stack allocator if USMP is not used
+    use_stack_allocator = not use_usmp
+
     create_main(
         "test.c",
         models,
@@ -748,6 +785,7 @@ def run_and_check(
         data_linkage,
         interface_api,
         workspace_bytes,
+        use_stack_allocator,
     )
 
     # Verify that compiles fine
@@ -868,3 +906,22 @@ def generate_ref_data(mod, input_data, params=None, target="llvm"):
         output_tensor_names = main.attrs["output_tensor_names"]
 
     return dict(zip(output_tensor_names, out))
+
+
+def create_relay_module_and_inputs_from_tflite_file(tflite_model_file):
+    """A helper function to create a Relay IRModule with inputs
+    and params from a tflite file"""
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    mod, params = convert_to_relay(tflite_model_buf)
+
+    inputs = dict()
+    for param in mod["main"].params:
+        name = str(param.name_hint)
+        data_shape = [int(i) for i in param.type_annotation.shape]
+        dtype = str(param.type_annotation.dtype)
+        in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max)
+        data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype)
+        inputs[name] = data
+
+    return mod, inputs, params
diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py
index d369fd0a4a30..8252ee68ade8 100644
--- a/tests/python/relay/aot/test_c_device_api.py
+++ b/tests/python/relay/aot/test_c_device_api.py
@@ -93,7 +93,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True):
             pass_config=test_runner.pass_config,
         )
         main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[0][1]
-        main_func = main_ir_module["run_model"]
+        main_func = main_ir_module["__tvm_main__"]
         return main_func
 
     return compile_to_main_func
@@ -124,7 +124,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True):
             pass_config=test_runner.pass_config,
         )
         main_ir_module = list(compiled_models[0].executor_factory.lowered_ir_mods.values())[0]
-        main_func = main_ir_module["run_model"]
+        main_func = main_ir_module["__tvm_main__"]
         return main_func
 
     return compile_to_main_func
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index f4f0806dca52..0147b8cf755a 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -28,6 +28,7 @@
 from tvm.relay.testing import byoc
 from tvm.relay.op.annotation import compiler_begin, compiler_end
 from tvm.relay.backend import Executor, Runtime
+from tvm.micro import model_library_format as mlf
 from aot_test_utils import (
     AOTTestModel,
     AOT_DEFAULT_RUNNER,
@@ -36,6 +37,7 @@
     compile_and_run,
     compile_models,
     parametrize_aot_options,
+    create_relay_module_and_inputs_from_tflite_file,
 )
 
 
@@ -541,13 +543,7 @@ def test_quant_mobilenet_tfl():
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
         "mobilenet_v1_1.0_224_quant.tflite",
     )
-    with open(tflite_model_file, "rb") as f:
-        tflite_model_buf = f.read()
-    data_shape = (1, 224, 224, 3)
-    in_min, in_max = (0, 255)
-    data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8")
-    mod, params = convert_to_relay(tflite_model_buf)
-    inputs = {"input": data}
+    mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file)
     output_list = generate_ref_data(mod, inputs, params)
     compile_and_run(
         AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params),
@@ -843,5 +839,75 @@ def representative_dataset():
         assert output_name in source
 
 
+@pytest.mark.parametrize(
+    "workspace_byte_alignment,main_workspace_size",
+    [
+        (8, 14880),
+        (16, 14880),
+        (256, 15616),
+    ],
+)
+def test_workspace_calculation(workspace_byte_alignment, main_workspace_size):
+    mod, params = tvm.relay.testing.synthetic.get_workload()
+    target = "c"
+    runtime = Runtime("crt")
+    executor = Executor(
+        "aot",
+        {
+            "workspace-byte-alignment": workspace_byte_alignment,
+        },
+    )
+    with tvm.transform.PassContext(
+        opt_level=3,
+        config={
+            "tir.disable_vectorize": True,
+        },
+    ):
+        lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params)
+
+    mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata)
+    assert mlf_memory_map["main"][0]["workspace_size_bytes"] == main_workspace_size
+
+
+@tvm.testing.requires_package("tflite")
+@tvm.testing.requires_cmsisnn
+def test_workspace_calculation_cmsis_nn():
+    """This tests cmsis_nn codegen for workspace calculation.
+    This is tested specially because cmsis-nn codegen creates
+    multiple PrimFuncs per offloaded relay function in a non
+    -hierarchical manner."""
+    pytest.importorskip("tflite")
+
+    from tvm.relay.op.contrib import cmsisnn
+    from tvm.contrib.download import download_testdata
+
+    target = "c"
+    runtime = Runtime("crt")
+    executor = Executor(
+        "aot",
+        {
+            "workspace-byte-alignment": 16,
+            "interface-api": "c",
+            "unpacked-api": True,
+        },
+    )
+
+    base_url = "https://github.com/ARM-software/ML-zoo/raw/48a22ee22325d15d2371a6df24eb7d67e21dcc97/models/keyword_spotting/cnn_small/tflite_int8"
+    file_to_download = "cnn_s_quantized.tflite"
+    file_saved = "cnn_s_quantized_15Dec2021.tflite"
+    model_file = download_testdata("{}/{}".format(base_url, file_to_download), file_saved)
+    mod, _, params = create_relay_module_and_inputs_from_tflite_file(model_file)
+    mod = cmsisnn.partition_for_cmsisnn(mod, params)
+    with tvm.transform.PassContext(
+        opt_level=3,
+        config={
+            "tir.disable_vectorize": True,
+        },
+    ):
+        lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params)
+    mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata)
+    assert mlf_memory_map["main"][0]["workspace_size_bytes"] == 9904
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py
index a27609cc07ad..73b34700ee27 100644
--- a/tests/python/relay/aot/test_crt_aot_usmp.py
+++ b/tests/python/relay/aot/test_crt_aot_usmp.py
@@ -39,6 +39,7 @@
     compile_models,
     parametrize_aot_options,
     run_and_check,
+    create_relay_module_and_inputs_from_tflite_file,
 )
 
 
@@ -202,23 +203,6 @@ def test_byoc_microtvm(merge_compiler_regions):
     )
 
 
-def _get_relay_module_and_inputs_from_tflite_file(tflite_model_file):
-    with open(tflite_model_file, "rb") as f:
-        tflite_model_buf = f.read()
-    mod, params = convert_to_relay(tflite_model_buf)
-
-    inputs = dict()
-    for param in mod["main"].params:
-        name = str(param.name_hint)
-        data_shape = [int(i) for i in param.type_annotation.shape]
-        dtype = str(param.type_annotation.dtype)
-        in_min, in_max = (np.iinfo(dtype).min, np.iinfo(dtype).max)
-        data = np.random.randint(in_min, high=in_max, size=data_shape, dtype=dtype)
-        inputs[name] = data
-
-    return mod, inputs, params
-
-
 MOBILENET_V1_URL = (
     "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
     "mobilenet_v1_1.0_224_quant.tflite",
@@ -253,7 +237,7 @@ def test_tflite_model_u1_usecase(model_url, usmp_algo, workspace_size):
         model_url[0],
         model_url[1],
     )
-    mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file)
+    mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file)
     output_list = generate_ref_data(mod, inputs, params)
 
     compiled_test_mods = compile_models(
@@ -324,7 +308,7 @@ def test_tflite_model_u3_usecase_single_external_pool(model_url, usmp_algo):
         model_url[0],
         model_url[1],
     )
-    mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file)
+    mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file)
     output_list = generate_ref_data(mod, inputs, params)
 
     compiled_test_mods = compile_models(
@@ -384,7 +368,7 @@ def test_tflite_model_u3_usecase_two_external_pools(model_url, usmp_algo):
         model_url[0],
         model_url[1],
     )
-    mod, inputs, params = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file)
+    mod, inputs, params = create_relay_module_and_inputs_from_tflite_file(tflite_model_file)
     output_list = generate_ref_data(mod, inputs, params)
 
     compiled_test_mods = compile_models(
@@ -438,14 +422,14 @@ def test_tflite_model_u2_usecase_two_models_with_a_single_external_pool(model_ur
         model_urls[0][0],
         model_urls[0][1],
     )
-    mod1, inputs1, params1 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file1)
+    mod1, inputs1, params1 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file1)
     output_list1 = generate_ref_data(mod1, inputs1, params1)
 
     tflite_model_file2 = tf_testing.get_workload_official(
         model_urls[1][0],
         model_urls[1][1],
     )
-    mod2, inputs2, params2 = _get_relay_module_and_inputs_from_tflite_file(tflite_model_file2)
+    mod2, inputs2, params2 = create_relay_module_and_inputs_from_tflite_file(tflite_model_file2)
     output_list2 = generate_ref_data(mod2, inputs2, params2)
 
     compiled_test_mods = compile_models(
diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
index ab40c646391c..07e31a989874 100644
--- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
+++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py
@@ -122,9 +122,9 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6:
                 T.store(T_cast_7.data, (((ax0_ax1_fused_5*3584) + (ax2_5*64)) + ax3_3), T.cast(T.load("uint8", tensor_2, (((ax0_ax1_fused_5*3584) + (ax2_5*64)) + ax3_3)), "int16"), True)
 
     @T.prim_func
-    def run_model(input: T.handle, output: T.handle) -> None:
+    def __tvm_main__(input: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "run_model", "runner_function": True})
+        T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
         # body
         T.attr("default", "device_id", 0)
         T.attr("default", "device_type", 1)
@@ -140,7 +140,7 @@ def run_model(input: T.handle, output: T.handle) -> None:
 @tvm.script.ir_module
 class LinearStructurePlanned:
     @T.prim_func
-    def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle,  output: T.handle) -> None:
+    def __tvm_main__(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle,  output: T.handle) -> None:
         fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], dtype="uint8", strides=[1], elem_offset=1, align=16)
         slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=1, align=16)
         # body
@@ -217,7 +217,7 @@ def test_mobilenet_subgraph():
     tir_mod = assign_poolinfos_to_allocates_in_irmodule(
         tir_mod, [fast_memory_pool, slow_memory_pool]
     )
-    main_func = tir_mod["run_model"]
+    main_func = tir_mod["__tvm_main__"]
     buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod)
     buffer_info_map = buffer_analysis.buffer_info_stmts
 
@@ -328,9 +328,9 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s
                     T.store(T_cast_7.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4, T.cast(T.max(T.min(T.q_multiply_shift(T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_3, ax3_inner_4) + T.load("int32", placeholder_26.data, ax3_outer_2 * 64 + ax3_inner_4), 1343014664, 31, -8, dtype="int32") + 136, 255), 0), "uint8"), "int32") - 136, 1073903788, 31, 1, dtype="int32") + T.load("int32", placeholder_28.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4), 255), 0), "uint8"), True)
 
     @T.prim_func
-    def run_model(input: T.handle, output: T.handle) -> None:
+    def __tvm_main__(input: T.handle, output: T.handle) -> None:
         # function attr dict
-        T.func_attr({"global_symbol": "run_model", "runner_function": True})
+        T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True})
         # body
         T.attr("default", "device_id", 0)
         T.attr("default", "device_type", 1)
@@ -464,7 +464,7 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla
                 T.store(T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2, T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_1_let, ax3_inner_2) + T.load("int32", placeholder_15.data, ax3_inner_2), 1608879842, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True)
 
     @T.prim_func
-    def run_model(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None:
+    def __tvm_main__(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None:
         global_workspace_0_buffer_var = T.match_buffer(global_workspace_0_var, [7920256], dtype="uint8", strides=[1], elem_offset=1, align=16)
         # body
         T.attr("default", "device_id", 0)
@@ -491,7 +491,7 @@ def test_resnet_subgraph():
     tir_mod = ResnetStructure
     tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target)
     tir_mod = assign_poolinfos_to_allocates_in_irmodule(tir_mod, [global_workspace_pool])
-    main_func = tir_mod["run_model"]
+    main_func = tir_mod["__tvm_main__"]
     buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod)
     buffer_info_map = buffer_analysis.buffer_info_stmts
 
diff --git a/tests/scripts/task_demo_microtvm.sh b/tests/scripts/task_demo_microtvm.sh
index 9ed9c671acc0..b5c18ec9e757 100755
--- a/tests/scripts/task_demo_microtvm.sh
+++ b/tests/scripts/task_demo_microtvm.sh
@@ -19,8 +19,7 @@
 set -euxo pipefail
 
 pushd apps/microtvm/zephyr_cmsisnn
-# Demo tests are disabled here due to https://github.com/apache/tvm/issues/10312
-# timeout 5m ./run_demo.sh
+ timeout 5m ./run_demo.sh
 popd
 
 pushd apps/microtvm/ethosu
@@ -28,6 +27,6 @@ FVP_PATH="/opt/arm/FVP_Corstone_SSE-300_Ethos-U55"
 CMAKE_PATH="/opt/arm/cmake/bin/cmake"
 FREERTOS_PATH="/opt/freertos/FreeRTOSv202112.00"
 
-# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH
-# timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH
+ timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH
+ timeout 5m ./run_demo.sh --fvp_path $FVP_PATH --cmake_path $CMAKE_PATH --freertos_path $FREERTOS_PATH
 popd