From f787a0a09431bdb332539378cbac9c28a406fedc Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 2 Feb 2021 19:14:25 +0000 Subject: [PATCH 01/33] [AOT] Introducing AOT in TVM This change adds the code generation and minimal runtime API to use the Ahead Of Time (AOT) compilation flow. The main logic is contained in: - src/relay/backend/aot_codegen.cc Which produces a TIR PrimFunc traversing the Relay graph The runtime interface (authored by @mousius) leaves a gap for future iterations using platform-specific features from RTOS. Currently AOT runs successfully on x86 in a host OS, running these tests on micro is coming soon. This PR is based on the RFC described here: https://discuss.tvm.apache.org/t/implementing-aot-in-tvm/9206 Co-authored-by: Christopher Sidebottom Change-Id: I9f731c953231f129e1472298915dddc01788efd7 --- cmake/modules/StandaloneCrt.cmake | 7 +- include/tvm/runtime/crt/aot/tvm_backend.h | 104 +++ include/tvm/runtime/crt/aot/tvm_error.h | 68 ++ include/tvm/runtime/crt/aot/tvm_executor.h | 97 +++ include/tvm/runtime/module.h | 2 + include/tvm/tir/builtin.h | 4 + python/tvm/micro/model_library_format.py | 33 +- .../relay/backend/graph_executor_codegen.py | 2 +- .../relay/backend/graph_executor_factory.py | 13 +- python/tvm/relay/build_module.py | 15 +- src/relay/backend/aot_codegen.cc | 674 ++++++++++++++++++ src/relay/backend/build_module.cc | 63 +- src/relay/backend/graph_executor_codegen.cc | 4 +- src/relay/backend/graph_plan_memory.cc | 4 +- src/runtime/crt/aot/tvm_executor.c | 91 +++ .../crt/graph_executor/graph_executor.c | 6 + src/runtime/meta_data.h | 32 + src/target/metadata_module.cc | 5 +- src/target/metadata_module.h | 5 +- src/target/source/codegen_c_host.cc | 67 +- src/target/source/codegen_c_host.h | 5 +- src/target/source/codegen_source_base.h | 7 +- src/target/source/source_module.cc | 32 +- src/target/source/source_module.h | 5 +- src/target/target_kind.cc | 4 +- src/tir/op/builtin.cc | 3 + src/tir/transforms/lower_tvm_builtin.cc | 1 + tests/cpp/relay_build_module_test.cc | 2 +- tests/cpp/utvm_runtime_standalone_test.cc | 2 +- tests/crt/aot_executor_test.cc | 199 ++++++ tests/crt/aot_memory_test.cc | 105 +++ tests/python/relay/aot/aot_test.mk | 71 ++ tests/python/relay/aot/infra.py | 213 ++++++ tests/python/relay/aot/test_crt_aot.py | 258 +++++++ .../relay/test_backend_graph_executor.py | 2 +- tests/python/relay/test_pass_annotation.py | 2 +- tests/python/unittest/test_crt.py | 4 +- tests/python/unittest/test_link_params.py | 4 +- .../test_micro_model_library_format.py | 2 +- .../test_runtime_module_based_interface.py | 10 +- 40 files changed, 2146 insertions(+), 81 deletions(-) create mode 100644 include/tvm/runtime/crt/aot/tvm_backend.h create mode 100644 include/tvm/runtime/crt/aot/tvm_error.h create mode 100644 include/tvm/runtime/crt/aot/tvm_executor.h create mode 100644 src/relay/backend/aot_codegen.cc create mode 100644 src/runtime/crt/aot/tvm_executor.c create mode 100644 tests/crt/aot_executor_test.cc create mode 100644 tests/crt/aot_memory_test.cc create mode 100644 tests/python/relay/aot/aot_test.mk create mode 100644 tests/python/relay/aot/infra.py create mode 100644 tests/python/relay/aot/test_crt_aot.py diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index fe6baf81c3e5..ea9b393afdcc 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -40,6 +40,7 @@ if(USE_MICRO) "3rdparty/dmlc-core/include *.h -> include" "include/tvm/runtime c_*_api.h -> include/tvm/runtime" 
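To make the runtime interface described in the commit message concrete, a minimal caller sketch follows. It uses only the declarations added under include/tvm/runtime/crt/aot/ later in this patch; the symbol my_network stands for the model descriptor emitted by the AOT flow, and wiring tvm_runtime_workspace by hand is an assumption, since the patch shows the allocator reading that global but not the platform glue that sets it.

    /* Hedged caller sketch; my_network and the workspace wiring are assumptions. */
    #include "tvm_executor.h"

    extern tvm_model_t my_network;                  /* descriptor emitted by the AOT flow */
    extern tvm_workspace_t* tvm_runtime_workspace;  /* global defined in tvm_executor.c */

    int run_inference(void* input_data, void* output_data) {
      void* inputs[1] = {input_data};
      void* outputs[1] = {output_data};

      /* Point the backend allocator at the scratch area recorded in the descriptor. */
      tvm_runtime_workspace = my_network.workspace;

      /* Returns kTvmErrorNoError (0) when the generated run function succeeds. */
      return tvm_runtime_run(&my_network, inputs, outputs, NULL);
    }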
"include/tvm/runtime/crt *.h -> include/tvm/runtime/crt" + "include/tvm/runtime/crt/aot *.h -> src/runtime/crt/aot" "src/runtime/crt Makefile -> ." "src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" @@ -48,6 +49,7 @@ if(USE_MICRO) "src/runtime/crt/host crt_config.h -> template/host" "src/runtime/crt/host *.cc -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" + "src/runtime/crt/aot *.c -> src/runtime/crt/aot" "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common" "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server" "src/runtime/minrpc *.h -> src/runtime/minrpc" @@ -135,6 +137,7 @@ if(USE_MICRO) file(GLOB TEST_SRCS ${CMAKE_SOURCE_DIR}/tests/crt/*_test.cc) find_path(GTEST_INCLUDE_DIR gtest/gtest.h) find_library(GTEST_LIB gtest "$ENV{GTEST_LIB}") + set(aot_executor_src "${standalone_crt_base}/src/runtime/crt/aot/tvm_executor.c") # Create the `crttest` target if we can find GTest. If not, we create dummy # targets that give the user an informative error message. @@ -144,7 +147,9 @@ if(USE_MICRO) string(REPLACE ".cc" "" __execname ${__srcname}) add_executable(${__execname} ${__srcpath}) list(APPEND TEST_EXECS ${__execname}) - target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_SOURCE_DIR}/src/runtime/crt/host) + target_sources(${__execname} PRIVATE ${aot_executor_src}) + target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/src/runtime/crt/host) + target_include_directories(${__execname} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/src/runtime/crt/aot) target_compile_options(${__execname} PRIVATE -pthread) target_link_libraries(${__execname} ${cmake_crt_libraries} ${GTEST_LIB} pthread) set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1) diff --git a/include/tvm/runtime/crt/aot/tvm_backend.h b/include/tvm/runtime/crt/aot/tvm_backend.h new file mode 100644 index 000000000000..1875cea10a6b --- /dev/null +++ b/include/tvm/runtime/crt/aot/tvm_backend.h @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file include/tvm/runtime/crt/aot/tvm_backend.h + * \brief Backend functions for the AOT executor + * + * These are not designed to user-facing and may change without warning + */ + +#ifndef TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ +#define TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ + +#include +#include + +#include "tvm_error.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! Memory alignment for allocator */ +#ifndef TVM_RUNTIME_ALLOC_ALIGNMENT +#define TVM_RUNTIME_ALLOC_ALIGNMENT 16 +#endif + +/*! 
The AOT runtime links staticly */ +#define TVM_DLL + +/*! + * \brief Minimal TVMValue + */ +typedef union { + int64_t v_int64; /** Currently used for parameter lookup */ + void* v_handle; /** Pointer to other values */ +} TVMValue; + +/*! + * \brief Packed function signature definition + */ +typedef int32_t(tvm_function_t)(void* args, void* arg_type_ids, int32_t num_args, + void* out_ret_value, void* out_ret_tcode, void* resource_handle); + +/*! + * \brief Workspace memory structure + */ +typedef struct { + uint8_t* next_alloc; /** Pointer to the next block of bytes to allocate */ + uint8_t* workspace; /** Pointer to start of the workspace */ + size_t workspace_size; /** Total number of bytes in the workspace */ +} tvm_workspace_t; + +/** + * \brief Backend function to allocate temporal workspace. + * + * \note The result allocated space is ensured to be aligned to TVM_RUNTIME_ALLOC_ALIGNMENT. + * \note Currently matches CRT runtime signature but this will change in future to accommodate + * memory planning + * + * \param device_type Ignored + * \param device_id Ignored + * \param nbytes The size of the space requested. + * \param dtype_code_hint Ignored + * \param dtype_bits_hint Ignored + * \return void* NULL on error, a valid pointer on success + */ +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, + int dtype_bits_hint); + +/*! + * \brief Backend function to free temporal workspace. + * + * \note Currently matches CRT runtime signature but this will change in future to accomodate memory + * planning + * + * \param ptr The result allocated space pointer. + * \param device_type Ignored + * \param device_id Ignored + * \return tvm_crt_error_t Containing any error statuses + */ +tvm_crt_error_t TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ diff --git a/include/tvm/runtime/crt/aot/tvm_error.h b/include/tvm/runtime/crt/aot/tvm_error.h new file mode 100644 index 000000000000..4b90c1afd9fe --- /dev/null +++ b/include/tvm/runtime/crt/aot/tvm_error.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file include/tvm/runtime/crt/aot/tvm_error.h + * \brief Defines a subset of error codes returned by the CRT AOT executor. 
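Before moving on to the error codes: the minimal TVMValue and the tvm_function_t signature above fix the calling convention shared by generated operators and the executor entry point. A hedged sketch of an operator written against that convention (the function name and argument order are illustrative only):

    /* Hedged operator sketch; "my_fused_add" and the argument layout are placeholders. */
    #include <stdint.h>
    #include "dlpack/dlpack.h"
    #include "tvm_backend.h"

    int32_t my_fused_add(void* args, void* arg_type_ids, int32_t num_args,
                         void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
      TVMValue* values = (TVMValue*)args;
      DLTensor* in0 = (DLTensor*)values[0].v_handle;  /* the executor passes DLTensor handles */
      DLTensor* out0 = (DLTensor*)values[1].v_handle;
      /* ... kernel body reading in0->data and writing out0->data ... */
      (void)arg_type_ids; (void)num_args; (void)out_ret_value;
      (void)out_ret_tcode; (void)resource_handle;
      return 0;  /* non-zero signals failure to the caller */
    }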
+ */ + +#ifndef TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ +#define TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define TVM_CRT_ERROR_CATEGORY_Pos 8 +#define TVM_CRT_ERROR_CATEGORY_Msk (0xff << TVM_CRT_ERROR_CATEGORY_Pos) +#define TVM_CRT_ERROR_CODE_Pos 0 +#define TVM_CRT_ERROR_CODE_Msk (0xff << TVM_CRT_ERROR_CODE_Pos) + +#define DEFINE_TVM_CRT_ERROR(category, code) \ + (((category) << TVM_CRT_ERROR_CATEGORY_Pos) | ((code) << TVM_CRT_ERROR_CODE_Pos)) +typedef enum { + kTvmErrorCategoryPlatform = 5, + kTvmErrorCategoryFunctionCall = 8, +} tvm_crt_error_category_t; + +typedef enum { + kTvmErrorNoError = 0, + + // Platform + kTvmErrorPlatformCheckFailure = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 0), + kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1), + kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), + kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3), + kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4), + + // Function Calls - common problems encountered calling functions. + kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), + kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1), + kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2), + + // System errors are always negative integers; this mask indicates presence of a system error. + // Cast tvm_crt_error_t to a signed integer to interpret the negative error code. + kTvmErrorSystemErrorMask = (1 << (sizeof(int) * 4 - 1)), +} tvm_crt_error_t; + +#ifdef __cplusplus +} +#endif + +#endif // TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ diff --git a/include/tvm/runtime/crt/aot/tvm_executor.h b/include/tvm/runtime/crt/aot/tvm_executor.h new file mode 100644 index 000000000000..efa5e7b06750 --- /dev/null +++ b/include/tvm/runtime/crt/aot/tvm_executor.h @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file include/tvm/runtime/crt/aot/tvm_executor.h + * \brief TVM Executor for the Ahead-of-Time Runtime + * + * AOT models are described by the TVM model descriptor format + * which can be passed to tvm_runtime_run. These descriptors will be + * generated by the AOT compilation process. This can optionally be + * augmented with platform specific context to be passed to the TVM + * operators. 
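For reference, the category/code packing defined in tvm_error.h above can be inverted with the same masks; a small hedged helper illustrating the layout (the helper name is not part of the patch):

    /* Hedged helper: split a tvm_crt_error_t into its category and code fields. */
    #include <stdio.h>
    #include "tvm_error.h"

    static void tvm_print_error(tvm_crt_error_t err) {
      int category = (err & TVM_CRT_ERROR_CATEGORY_Msk) >> TVM_CRT_ERROR_CATEGORY_Pos;
      int code = (err & TVM_CRT_ERROR_CODE_Msk) >> TVM_CRT_ERROR_CODE_Pos;
      /* e.g. kTvmErrorPlatformNoMemory -> category 5 (Platform), code 3 */
      printf("tvm_crt_error: category=%d code=%d\n", category, code);
    }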
+ * + * Example: + * extern tvm_model_t my_network; + * int main() { + * void* data = get_data(); + * void* output[4] = {0, 0, 0, 0}; + * void* inputs = {data}; + * void* outputs = {output}; + * tvm_context_t my_context = { + * .driver = ...; + * }; + * tvm_runtime_run( + * &my_network, + * inputs, + * outputs + * &my_context + * ); + * return 0; + * } + */ + +#ifndef TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ + +#include + +#include "tvm_backend.h" +#include "tvm_error.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief Context information for future integrations + * which is passed through to the operators. + * + * \note Can be used for drivers and platform specific information. + */ +typedef struct { +} tvm_context_t; + +/*! + * \brief TVM Model descriptor to describe the + * model to the runtime. + */ +typedef struct { + uint32_t num_input_tensors; /** Number of expected input tensors */ + uint32_t num_output_tensors; /** Number of expected output tensors */ + tvm_function_t* run_func; /** Generated model function, called through tvm_runtime_run */ + tvm_workspace_t* workspace; /** Memory workspace for the model to use */ +} tvm_model_t; + +/*! + * \brief Main entry point for + * \param model Model descriptor structure to reference for runtime information + * \param inputs Pointer to input pointer(s) + * \param outputs Pointer to output pointer(s) + * \param context Context information to be passed through to operators + * \return tvm_status_t containing success or errors from the model run + */ +tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs, + tvm_context_t* context); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 04a5cf8bf25d..689fe6fa53fc 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -230,6 +230,8 @@ constexpr const char* tvm_module_main = "__tvm_main__"; constexpr const char* tvm_param_prefix = "__tvm_param__"; /*! \brief A PackedFunc that looks up linked parameters by storage_id. */ constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; +/*! \brief The main AOT executor function */ +constexpr const char* tvm_run_func_prefix = "tvm__run_func"; } // namespace symbol // implementations of inline functions. diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index d8248d4e1a87..33c234eeede5 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -346,6 +346,10 @@ TVM_DLL const Op& tvm_stack_make_array(); */ TVM_DLL const Op& tvm_call_packed(); +// This achieve the same of a packed call, but with an extern call +// directly to the operator +TVM_DLL const Op& tvm_call_unpacked(); + /*! * \brief See pesudo code * diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 6768e03f4473..a3f0d4153aa6 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -73,7 +73,7 @@ def _populate_codegen_dir(mod, codegen_dir: str): dso_mod.save(file_name) -def _build_memory_map(graph_json): +def _build_memory_map(graph_str): """Build a simpler memory map from graph JSON. Parameters @@ -86,10 +86,13 @@ def _build_memory_map(graph_json): list : A list with one entry per storage id describing that memory. 
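Tying the pieces above together: the codegen names the generated entry point after tvm_run_func_prefix, and a tvm_model_t descriptor is what tvm_runtime_run consumes. A hedged illustration of the descriptor the downstream metadata/source module is expected to emit (the workspace size and buffer names are placeholders; the emitting code is not part of this section):

    /* Hedged illustration of an emitted model descriptor; sizes and names are placeholders. */
    #include <stdint.h>
    #include "tvm_executor.h"

    /* Entry function generated from the Relay "main" function. */
    extern int32_t tvm__run_func(void* args, void* arg_type_ids, int32_t num_args,
                                 void* out_ret_value, void* out_ret_tcode,
                                 void* resource_handle);

    static uint8_t model_workspace_buffer[4096]
        __attribute__((aligned(TVM_RUNTIME_ALLOC_ALIGNMENT)));
    static tvm_workspace_t model_workspace = {
        .next_alloc = model_workspace_buffer,
        .workspace = model_workspace_buffer,
        .workspace_size = sizeof(model_workspace_buffer),
    };

    tvm_model_t my_network = {
        .num_input_tensors = 1,   /* from the AOT metadata (number of model inputs) */
        .num_output_tensors = 1,  /* from the AOT metadata (number of model outputs) */
        .run_func = &tvm__run_func,
        .workspace = &model_workspace,
    };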
""" - graph = json.loads(graph_json) + memory_map = [] + if graph_str.startswith("primfn"): + return memory_map + + graph = json.loads(graph_str) seen_storage_ids = set() - memory_map = [] for node_id, storage_id in enumerate(graph["attrs"]["storage_id"][1]): if storage_id in seen_storage_ids: continue @@ -132,14 +135,25 @@ def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactory Path to the .tar archive to generate. """ tempdir = utils.tempdir() + is_aot = False + for v in mod.target.values(): + if v.attrs.get("executor", "graph_runtime") == "aot": + is_aot = True + break + + runtime = ["graph"] + if is_aot: + runtime = ["aot"] + metadata = { "version": 1, "model_name": mod.libmod_name, "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"), - "memory": _build_memory_map(mod.graph_json), + "memory": _build_memory_map(mod.graph), "target": {int(k): str(v) for k, v in mod.target.items()}, - "runtimes": ["graph"], + "runtimes": runtime, } + with open(tempdir.relpath("metadata.json"), "w") as json_f: json.dump(metadata, json_f, indent=2, sort_keys=True) @@ -156,10 +170,11 @@ def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactory with open(tempdir.relpath("relay.txt"), "w") as f: f.write(str(mod.ir_mod)) - graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) - os.makedirs(graph_config_dir_path) - with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: - f.write(mod.graph_json) + if not is_aot: + graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) + os.makedirs(graph_config_dir_path) + with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: + f.write(mod.graph) with tarfile.open(file_name, "w") as tar_f: diff --git a/python/tvm/relay/backend/graph_executor_codegen.py b/python/tvm/relay/backend/graph_executor_codegen.py index f24bf2c2b55b..6dcc5655aa9a 100644 --- a/python/tvm/relay/backend/graph_executor_codegen.py +++ b/python/tvm/relay/backend/graph_executor_codegen.py @@ -46,7 +46,7 @@ def __init__(self, mod, target): self._mod = _build_module._GraphExecutorCodegen() self._init = self._mod["init"] self._codegen = self._mod["codegen"] - self._get_graph_json = self._mod["get_graph_json"] + self._get_graph_json = self._mod["get_graph"] self._list_params_name = self._mod["list_params_name"] self._get_param_by_name = self._mod["get_param_by_name"] self._get_irmodule = self._mod["get_irmodule"] diff --git a/python/tvm/relay/backend/graph_executor_factory.py b/python/tvm/relay/backend/graph_executor_factory.py index d6959d22e5c8..bc543d90c8fb 100644 --- a/python/tvm/relay/backend/graph_executor_factory.py +++ b/python/tvm/relay/backend/graph_executor_factory.py @@ -41,17 +41,18 @@ class GraphExecutorFactoryModule: The parameters of module """ - def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params): + def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): assert isinstance(graph_json_str, string_types) fcreate = get_global_func("tvm.graph_executor_factory.create") args = [] for k, v in params.items(): args.append(k) args.append(ndarray.array(v)) + self.ir_mod = ir_mod self.target = target - self.module = fcreate(graph_json_str, libmod, libmod_name, *args) - self.graph_json = graph_json_str + self.module = fcreate(graph_str, libmod, libmod_name, *args) + self.graph = graph_str self.lib = libmod self.libmod_name = libmod_name self.params = params @@ -66,8 +67,8 @@ def export_library(self, file_name, 
fcompile=None, addons=None, **kwargs): def get_params(self): return self.params - def get_json(self): - return self.graph_json + def get_graph(self): + return self.graph def get_lib(self): return self.lib @@ -90,7 +91,7 @@ def __next__(self): if self.iter_cnt > 2: raise StopIteration - objs = [self.graph_json, self.lib, self.params] + objs = [self.graph, self.lib, self.params] obj = objs[self.iter_cnt] self.iter_cnt += 1 return obj diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 6eb684e570d9..88a49fde0461 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -77,7 +77,7 @@ class BuildModule(object): def __init__(self): self.mod = _build_module._BuildModule() - self._get_graph_json = self.mod["get_graph_json"] + self._get_graph = self.mod["get_graph"] self._get_module = self.mod["get_module"] self._build = self.mod["build"] self._optimize = self.mod["optimize"] @@ -143,11 +143,11 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts - graph_json = self.get_json() + graph = self.get_graph() mod = self.get_module() params = self.get_params() - return graph_json, mod, params + return graph, mod, params def optimize(self, mod, target=None, params=None): """ @@ -187,9 +187,9 @@ def optimize(self, mod, target=None, params=None): def _set_params(self, params): self._set_params_func(_convert_param_map(params)) - def get_json(self): + def get_graph(self): """Return the json file of the built program.""" - return self._get_graph_json() + return self._get_graph() def get_module(self): """Return the built module.""" @@ -251,7 +251,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- - factory_module : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule + factory_module : tvm.relay.backend.graph_executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. """ # pylint: enable=line-too-long @@ -287,7 +287,8 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() - graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) + + graph, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) executor_factory = _graph_executor_factory.GraphExecutorFactoryModule( ir_mod, target, graph_json, runtime_mod, mod_name, params ) diff --git a/src/relay/backend/aot_codegen.cc b/src/relay/backend/aot_codegen.cc new file mode 100644 index 000000000000..401334ef11cf --- /dev/null +++ b/src/relay/backend/aot_codegen.cc @@ -0,0 +1,674 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file relay/backend/graph_codegen.cc + * \brief Graph runtime codegen + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../runtime/meta_data.h" +#include "compile_engine.h" +#include "utils.h" + +namespace tvm { +namespace relay { +namespace backend { + +using IntegerArray = Array; +using ShapeVector = std::vector>; +using GraphAttrs = std::unordered_map; +using TargetsMap = std::unordered_map; + +/*! \brief Lowered outputs */ +struct AOTLoweredOutput { + std::string graph_tir; + Map lowered_funcs; + Array external_mods; + std::unordered_map> params; + runtime::AOTMetadata aot_metadata; +}; + +class AotReturnSidVisitor : public ExprVisitor { + public: + explicit AotReturnSidVisitor(Map> storage_device_map) + : storage_device_map_{storage_device_map}, return_sid_{-1} {} + + IntegerArray FindReturnSid(Function func) { + VisitExpr(func->body); + return return_sid_; + } + + protected: + void AssignReturnSid(Expr e) { + auto iter = storage_device_map_.find(e); + if (iter != storage_device_map_.end()) { + return_sid_ = (*iter).second[0]; + } + } + + void VisitExpr_(const ConstantNode* cn) override { + ExprVisitor::VisitExpr_(cn); + AssignReturnSid(GetRef(cn)); + } + + void VisitExpr_(const VarNode* vn) override { + ExprVisitor::VisitExpr_(vn); + AssignReturnSid(GetRef(vn)); + } + + void VisitExpr_(const CallNode* cn) override { + ExprVisitor::VisitExpr_(cn); + AssignReturnSid(GetRef(cn)); + } + + void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); } + + void VisitExpr_(const TupleNode* tn) override { + ExprVisitor::VisitExpr_(tn); + AssignReturnSid(GetRef(tn)); + } + + private: + Map> storage_device_map_; + IntegerArray return_sid_; +}; + +using TIRNetwork = tvm::Array; + +/*! \brief Code generator for graph runtime */ +class AOTCodegen : public ExprVisitor { + protected: + /*! + * \brief Utility function to allocate a DLTensor or TVMValue + * \param type the type of allocation + * \param num the number of variable to allocate on the stack + * \return PrimExpr representing the allocated object + */ + PrimExpr StackAlloca(std::string type, size_t num) { + Array args = {tir::StringImm(type), ConstInt32(num)}; + return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), args); + } + + /*! + * \brief Utility function to allocate memory for storage identifiers + * \param memory_size_byte size in bytes of the allocation + * \return PrimExpr representing the allocated memory + */ + PrimExpr AllocateBackendMemory(int memory_size_byte) { + // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace + // to enable unified memory planning + static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace"); + return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 8}); + } + + /*! + * \brief Utility function to convert a concrete integer to a PrimExpr. + * \param num the number to convert + * \return PrimExpr representing num + */ + inline PrimExpr ConstInt32(size_t num) { + ICHECK_LE(num, std::numeric_limits::max()); + return tir::make_const(DataType::Int(32), static_cast(num)); + } + + /*! 
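A note on AllocateBackendMemory above: it emits a call to the tir.TVMBackendAllocWorkspace op with fixed device and dtype-hint arguments (1, 0, size, 2, 8), so after lowering to the C target each intermediate storage id becomes a plain call into the backend API from tvm_backend.h. A hedged sketch of that C-level shape (the size is a placeholder, and whether the generated function also emits the matching free is not shown in this section):

    /* Hedged sketch of one lowered storage-id allocation; 2048 is a placeholder size. */
    #include <stdint.h>
    #include "tvm_backend.h"

    int32_t alloc_example(void) {
      void* sid_3 = TVMBackendAllocWorkspace(1, 0, (uint64_t)2048, 2, 8);
      if (sid_3 == NULL) {
        return -1;  /* the CRT allocator returns NULL when the workspace is exhausted */
      }
      /* ... operator calls reading and writing sid_3 ... */
      return TVMBackendFreeWorkspace(1, 0, sid_3);  /* LIFO release, see tvm_executor.c */
    }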
+ * \brief Return a vector of variables that represents the sids for the given Relay Expr + */ + std::vector pack_sid(Expr expr) { + Array sids = storage_device_map_[expr]; + std::vector sid_vars; + + // Note that an expression can have multiple sids associated with it + // e.g., returning multiple values from a function + for (const auto& sid : sids[0]) { + // Determine if an sid is an output buffer + int sid_int = static_cast((sid.as())->value); + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + sid_vars.push_back(main_signature_[input_vars_.size() + output_index]); + continue; + } + // Pack the sid inside the TVMValue + auto sid_array = te::Var(make_string("sid_", sid, "_value"), DataType::Handle()); + auto sid_value = sids_table_[sid]; + tvm::PrimExpr set_tensor = + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {sid_array, 0, tir::builtin::kArrData, sid_value}); + stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); + sid_vars.push_back(sid_array); + } + return sid_vars; + } + + /*! + * \brief Utility function to return a parameter associated with an expression + * \param expr Relay Expression assicated with the parameter + * \return Variable that represents the DLTensor associated with the parameters + */ + tir::Var pack_param(Expr expr) { + // TODO(giuseros): Using call_extern to call into lookup_linked_param. This is because the + // builtin::ret is not supported yet in the c target. Once return is supported we can use + // tvm_call_packed_lowered(). + int param_sid = param_storage_ids_[reverse_params_lookup_[expr]]; + auto lookup_linked_param_fn = tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param); + auto param_array = te::Var(make_string("param_", param_sid, "_array"), DataType::Handle()); + + // Compose the lookup_call using a local stack + Array lookup_call; + auto param_var = te::Var(make_string("param_", param_sid, "_value"), DataType::Handle()); + auto ret_var = te::Var("ret_value", DataType::Handle()); + auto ret_code = te::Var("ret_value", DataType::Handle()); + + lookup_call.push_back(tir::Evaluate( + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {param_var, 0, tir::builtin::kTVMValueContent, ConstInt32(param_sid)}))); + lookup_call.push_back(tir::Evaluate( + tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(), + {lookup_linked_param_fn, param_var, 0, 0, ret_var, ret_code, 0}))); + auto ret_var_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {ret_var, 0, tir::builtin::kTVMValueContent}); + + // Set the param to the value returned by lookup_call + tvm::PrimExpr set_param_array = + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {param_array, 0, tir::builtin::kArrData, ret_var_handle}); + lookup_call.push_back(tir::Evaluate(set_param_array)); + + tir::Stmt lookup_body = tir::SeqStmt(lookup_call); + + // Allocate the DLTensors on the stack + lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), lookup_body); + lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), lookup_body); + lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), lookup_body); + lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), lookup_body); + stmts_.push_back(lookup_body); + return param_array; + } + + /*! 
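On pack_param above: the generated TIR packs the parameter's storage id into a TVMValue, calls _lookup_linked_param through call_extern, and installs the returned handle as the data pointer of a stack-allocated DLTensor. In C-runtime terms the lookup is expected to behave roughly like the sketch below; the argument packing is simplified relative to the emitted TIR, and the storage id 4 is a placeholder:

    /* Hedged sketch of the linked-parameter lookup; storage id 4 is a placeholder. */
    #include <stdint.h>
    #include "tvm_backend.h"

    /* Packed-style lookup function emitted when parameters are linked into the module. */
    extern int32_t _lookup_linked_param(void* args, void* arg_type_ids, int32_t num_args,
                                        void* out_ret_value, void* out_ret_tcode,
                                        void* resource_handle);

    static const void* lookup_param_4(void) {
      TVMValue arg;
      TVMValue ret;
      int32_t ret_tcode = 0;
      arg.v_int64 = 4;  /* storage id of the parameter, as in pack_param */
      if (_lookup_linked_param(&arg, NULL, 0, &ret, &ret_tcode, NULL) != 0) {
        return NULL;
      }
      return (const void*)ret.v_handle;  /* raw data pointer of the linked constant */
    }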
+ * brief Given an expression return the variable(s) associated with that expression + */ + std::vector find_expr(Expr arg) { + auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg); + if (input_iter != input_vars_.end()) { + // Input variable + int main_index = std::distance(input_vars_.begin(), input_iter); + return {main_signature_[main_index]}; + } else if (reverse_params_lookup_.find(arg) != reverse_params_lookup_.end()) { + // Parameter of the network + return {pack_param(arg)}; + } else { + // Storage identifier (i.e., intermediate memory) + return pack_sid(arg); + } + } + + /*! + * brief Call a function with a given name + */ + void func_call(Call call, std::string func_name) { + tvm::Array args{tvm::tir::StringImm(func_name)}; + std::vector func_call_stmts; + + // Pack the inputs + for (Expr arg : call->args) { + auto var_arg = find_expr(arg); + args.push_back(var_arg[0]); + } + + auto ret_expr = Downcast(call); + + // Pack the return(s) value. A call node can produce multiple outputs + for (const auto& var : pack_sid(ret_expr)) { + args.push_back(var); + } + + // Use tvm_call_packed to execute the function + func_call_stmts.push_back(tir::Evaluate( + tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_packed(), args))); + tir::Stmt body = tir::SeqStmt(func_call_stmts); + stmts_.push_back(body); + } + + /*! + * brief Copy a variable to the output. This function is mainly used in edge cases + * when we want to return an input or a parameter. + */ + void copy_to_output(te::Var out, te::Var in, size_t size) { + auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {in, 0, tir::builtin::kArrData}); + + // Define intermediate DLTensor to load/store the data + auto tmp0 = te::Var("tmp0", DataType::Handle()); + auto tmp1 = te::Var("tmp1", DataType::Handle()); + te::Var loop_idx("i", DataType::Int(32)); + auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true()); + auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), + {out, 0, tir::builtin::kArrData}); + + // Copy the variable from the input to the output + tir::Stmt copy = tir::For( + loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, + tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true())); + stmts_.push_back(tir::LetStmt(tmp1, tostore, copy)); + } + + /*! + * Utility function to string together different arguments + */ + template + std::string make_string(Args const&... 
args) { + std::ostringstream ss; + using List = int[]; + (void)List{0, ((void)(ss << args), 0)...}; + + return ss.str(); + } + + void VisitExpr_(const CallNode* op) override { + // Descend the call tree + for (auto arg : op->args) { + VisitExpr(arg); + } + + Expr expr = GetRef(op); + Function func; + if (op->op.as()) { + LOG(FATAL) << "Operators should be transformed away; try applying" + << "the fuse_ops transformation to the expression."; + } else if (op->op.as()) { + LOG(FATAL) << "Not implemented"; + } else if (op->op.as()) { + func = GetRef(op->op.as()); + } else { + LOG(FATAL) << "TVM runtime does not support calls to " << op->op->GetTypeKey(); + } + if (!func->HasNonzeroAttr(attr::kPrimitive)) { + LOG(FATAL) << "TVM only support calls to primitive functions " + << "(i.e functions composed of fusable operator invocations)"; + } + + auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey"); + auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower"); + Target target; + // Handle external function + if (func->GetAttr(attr::kCompiler).defined()) { + target = Target("ext_dev"); + CCacheKey key = (*pf0)(func, target); + CachedFunc ext_func = (*pf1)(compile_engine_, key); + ICHECK(ext_func.defined()) << "External function is not defined."; + UpdateConstants(func, ¶ms_); + + // Generate the TIR function call + func_call(GetRef(op), ext_func->func_name); + } + + ICHECK_GE(storage_device_map_.count(expr), 0); + auto& device_type = storage_device_map_[expr][1]; + auto call_dev_type = device_type[0]->value; + // Normal Relay Function + if (targets_.size() == 1) { + // homogeneous execution. + const auto& it = targets_.begin(); + target = (*it).second; + } else { + // heterogeneous execution. + std::string call_dev_name; + if (call_dev_type == 0) { + call_dev_name = "llvm"; + } else { + call_dev_name = runtime::DeviceName(call_dev_type); + } + if (targets_.count(call_dev_type) == 0) { + LOG(FATAL) << "No target is provided for device " << call_dev_name; + } + target = targets_[call_dev_type]; + } + CCacheKey key = (*pf0)(func, target); + CachedFunc lowered_func = (*pf1)(compile_engine_, key); + if (!lowered_funcs_.count(target->str())) { + lowered_funcs_[target->str()] = IRModule(Map({})); + } + lowered_funcs_[target->str()]->Update(lowered_func->funcs); + + // Generate the TIR function call + func_call(GetRef(op), lowered_func->func_name); + } + + void VisitExpr_(const VarNode* op) override { + Expr expr = GetRef(op); + + // If the Var node is an output node we need to copy the content of the variable to the output + // A Var node can only produce a single output + Array sids = storage_device_map_[expr]; + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), + static_cast((sids[0][0].as())->value)); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + auto var_expr = find_expr(expr); + copy_to_output(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]); + } + } + + void VisitExpr_(const ConstantNode* op) override { + Expr expr = GetRef(op); + size_t index = params_.size(); + std::string name = "p" + std::to_string(index); + + param_storage_ids_[name] = storage_device_map_[expr][0][0]->value; + params_[name] = op->data; + reverse_params_lookup_.Set(expr, name); + + // If the Constant node is an output node we need to copy the content of the parameter to the + // output A Var node can only produce a single output + Array sids = storage_device_map_[expr]; + auto output_iter = 
std::find(return_sid_.begin(), return_sid_.end(), + static_cast((sids[0][0].as())->value)); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + copy_to_output(main_signature_[input_vars_.size() + output_index], pack_param(expr), + sids[2][0]); + } + } + + void VisitExpr_(const TupleNode* op) override { + for (auto field : op->fields) { + VisitExpr(field); + } + } + + void VisitExpr_(const LetNode* op) override { + // TODO(giuseros): support Let nodes in AOT + throw std::invalid_argument("Let not yet implemented in AOT"); + } + void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); } + void VisitExpr_(const OpNode* op) override { + throw std::runtime_error("can not compile op in non-eta expanded form"); + } + void VisitExpr_(const GlobalVarNode* op) override { throw std::runtime_error(""); } + void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if not supported"); } + void VisitExpr_(const FunctionNode* op) override { + ICHECK(op->GetAttr(attr::kCompiler).defined()) + << "Only functions supported by custom codegen"; + } + void VisitExpr_(const RefCreateNode* op) override { + throw std::invalid_argument("reference not supported"); + } + void VisitExpr_(const RefReadNode* op) override { + throw std::invalid_argument("reference not supported"); + } + void VisitExpr_(const RefWriteNode* op) override { + throw std::invalid_argument("reference not supported"); + } + void VisitExpr_(const ConstructorNode* op) override { + throw std::invalid_argument("ADT constructor case not yet implemented"); + } + void VisitExpr_(const MatchNode* op) override { + throw std::invalid_argument("match case not yet implemented"); + } + + // Create the main PrimFunc to execute the graph + tir::PrimFunc CreateMainFunc(unsigned int relay_params) { + tir::Stmt body = tir::SeqStmt(stmts_); + + // Allocate the sids + std::unordered_map allocated; + + for (auto kv : storage_device_map_) { + // Only allocate sids that are needed + const bool is_input = + (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end()); + const bool is_param = (reverse_params_lookup_.find(kv.first) != reverse_params_lookup_.end()); + if (is_input || is_param) { + continue; + } + + for (unsigned int i = 0; i < kv.second[0].size(); i++) { + int size = kv.second[2][i]; + int sid = static_cast((kv.second[0][i].as())->value); + + if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) { + continue; + } + + if (!allocated[sid]) { + body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), body); + } + allocated[sid] = true; + } + } + + // Define the attributes + body = tir::AttrStmt(PrimExpr(), tir::attr::device_context_type, 1, body); + body = tir::AttrStmt(PrimExpr(), tir::attr::device_context_id, 0, body); + + // Make the PrimFunc + return tir::PrimFunc(main_signature_, body, VoidType(), Map(), + DictAttrs(dict_attrs_)); + } + + protected: + /*! \brief nodes */ + /*! \brief mod */ + runtime::Module* mod_; + std::vector input_vars_; + Array main_signature_; + /*! \brief target device */ + TargetsMap targets_; + Target target_host_; + Map dict_attrs_; + + /*! + * \brief parameters (i.e. ConstantNodes found in the graph). + * These are take as inputs to the GraphRuntime. + * Maps param name to a pair of storage_id and NDArray. At runtime, the storage_id can be + * used to lookup the parameter. 
+ */ + Map reverse_params_lookup_; + std::unordered_map params_; + std::unordered_map param_storage_ids_; + + /*! \brief plan memory of device result */ + Map> storage_device_map_; + std::unordered_map sids_table_; + /*! \brief lowered funcs */ + std::unordered_map lowered_funcs_; + /*! \brief name map */ + std::unordered_map name_map_; + /*! \brief compile engine */ + CompileEngine compile_engine_; + /*! \brief GraphPlanMemory module */ + runtime::Module graph_plan_memory_module_; + /*! \brief the IR module stored which represents the executor program */ + Map tir_module_; + /*! \brief the set of statements that make the program */ + std::vector stmts_; + /*! \brief the list of return sids (note that the function might return more then one output */ + IntegerArray return_sid_; + + public: + AOTCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) + : mod_(mod), return_sid_() { + compile_engine_ = CompileEngine::Global(); + targets_ = targets; + target_host_ = target_host; + dict_attrs_.Set("global_symbol", runtime::String("tvm__run_func")); + } + + AOTLoweredOutput Codegen(relay::Function func) { + // Get the module, storage map and token sizes + auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); + storage_device_map_ = (*pf)(func); + + int input_index = 0; + for (auto input : func->params) { + input_vars_.push_back(input); + main_signature_.push_back(tir::Var(make_string("input_", input_index), DataType::Handle())); + } + + // Define the storage allocator ids + for (auto kv : storage_device_map_) { + for (const auto& sid : kv.second[0]) { + te::Var sid_var(make_string("sid_", sid), DataType::Handle()); + sids_table_[sid] = sid_var; + } + } + + // Find the return sid + return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); + for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { + main_signature_.push_back(tir::Var(make_string("output_", output_index), DataType::Handle())); + } + + VisitExpr(func->body); + + auto prim_func = CreateMainFunc(func->params.size()); + AOTLoweredOutput ret; + + ret.params = std::unordered_map>(); + for (auto param : params_) { + ret.params.emplace(std::make_pair( + param.first, + std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); + } + + for (auto& kv : lowered_funcs_) { + if (ret.lowered_funcs.count(kv.first) == 0) { + ret.lowered_funcs.Set(kv.first, IRModule(Map({}))); + } + auto& mod = ret.lowered_funcs[kv.first]; + mod->Update(kv.second); + ret.lowered_funcs.Set(kv.first, mod); + } + ret.external_mods = compile_engine_->LowerExternalFunctions(); + + auto target_host_str = target_host_->str(); + if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) { + ret.lowered_funcs[target_host_str]->Add( + GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func); + } else { + Map symbol_map; + symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func); + ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); + } + + ret.graph_tir = PrettyPrint(prim_func); + ret.aot_metadata = runtime::AOTMetadata(input_vars_.size(), return_sid_.size()); + return ret; + } +}; + +class AOTCodegenModule : public runtime::ModuleNode { + public: + AOTCodegenModule() {} + virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { + if (name == "init") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + ICHECK_EQ(args.num_args, 3) << "The expected of arguments are: " + << 
"runtime::Module mod and Map targets"; + void* mod = args[0]; + Map tmp = args[1]; + tvm::Target target_host = args[2]; + TargetsMap targets; + for (const auto& it : tmp) { + auto dev_type = it.first.as(); + ICHECK(dev_type); + targets[dev_type->value] = it.second; + } + codegen_ = std::make_shared(reinterpret_cast(mod), targets, + target_host); + }); + } else if (name == "codegen") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + Function func = args[0]; + this->output_ = this->codegen_->Codegen(func); + }); + } else if (name == "get_graph") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.graph_tir; }); + } else if (name == "list_params_name") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + Array ret; + for (const auto& kv : this->output_.params) { + ret.push_back(kv.first); + } + *rv = ret; + }); + } else if (name == "get_param_by_name") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.second; + }); + } else if (name == "get_param_id") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.first; + }); + } else if (name == "get_irmodule") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.lowered_funcs; + }); + } else if (name == "get_external_modules") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.external_mods; + }); + } else if (name == "get_aot_metadata") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->output_.aot_metadata; + }); + } else { + return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); + } + } + + const char* type_key() const final { return "RelayGraphRuntimeCodegenModule"; } + + private: + std::shared_ptr codegen_; + AOTLoweredOutput output_; +}; + +runtime::Module CreateAOTCodegenMod() { + auto ptr = make_object(); + return runtime::Module(ptr); +} + +TVM_REGISTER_GLOBAL("relay.build_module._GraphAOTCodegen") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CreateAOTCodegenMod(); }); + +} // namespace backend +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 07bb51150bee..f1df6b1ad181 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -43,12 +43,14 @@ namespace backend { using TargetsMap = Map; using namespace tvm::relay::transform; +enum class Executor { GraphRuntime, Aot }; + /*! 
* \brief Output of building module * */ struct BuildOutput { - std::string graph_json; + std::string graph; runtime::Module mod; std::unordered_map params; }; @@ -59,17 +61,35 @@ struct BuildOutput { */ struct GraphCodegen { public: - GraphCodegen() { - auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); - mod = (*pf)(); + explicit GraphCodegen(Target target_host) : target_host_(target_host) { + const String executor_str = target_host->GetAttr("executor").value_or("graph_runtime"); + if (executor_str == "graph_runtime") { + executor_ = Executor::GraphRuntime; + auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); + mod = (*pf)(); + } else if (executor_str == "aot") { + executor_ = Executor::Aot; + auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); + mod = (*pf)(); + } else { + LOG(FATAL) << "Executor not supported"; + } } ~GraphCodegen() {} - void Init(runtime::Module* m, TargetsMap targets) { CallFunc("init", m, targets); } + void Init(runtime::Module* m, TargetsMap targets) { + if (executor_ == Executor::GraphRuntime) { + CallFunc("init", m, targets); + } else if (executor_ == Executor::Aot) { + CallFunc("init", m, targets, target_host_); + } else { + LOG(FATAL) << "Executor not supported"; + } + } void Codegen(const Function& func) { CallFunc("codegen", func); } - std::string GetJSON() { return CallFunc("get_graph_json", nullptr); } + std::string GetGraph() { return CallFunc("get_graph", nullptr); } Array GetExternalModules() { return CallFunc>("get_external_modules", nullptr); @@ -101,7 +121,18 @@ struct GraphCodegen { return ret; } + runtime::AOTMetadata GetAOTMetdata() { + if (executor_ == Executor::Aot) { + return CallFunc("get_aot_metadata"); + } else { + // Graph runtime does not need AOT metadata + return runtime::AOTMetadata(); + } + } + protected: + Executor executor_; + Target target_host_; tvm::runtime::Module mod; template R CallFunc(const std::string& name, Args... args) { @@ -129,9 +160,9 @@ class RelayBuildModule : public runtime::ModuleNode { * \return The corresponding member function. */ PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { - if (name == "get_graph_json") { + if (name == "get_graph") { return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetGraphJSON(); }); + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetGraph(); }); } else if (name == "get_module") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); }); @@ -177,7 +208,7 @@ class RelayBuildModule : public runtime::ModuleNode { * * \return const std::string graph_json */ - const std::string& GetGraphJSON() { return ret_.graph_json; } + const std::string& GetGraph() { return ret_.graph; } /*! * \brief Get the Module object @@ -473,15 +504,22 @@ class RelayBuildModule : public runtime::ModuleNode { // Relay IRModule -> IRModule optimizations. relay_module = Optimize(relay_module, targets_, params); + + Target target_host = GetTargetHost(); + // If no target_host has been set, we choose a default one, which is + // llvm if "codegen.LLVMModuleCreate" is accessible. + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); + if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + // Get the updated function. auto func = Downcast(relay_module->Lookup("main")); // Generate code for the updated function. 
- graph_codegen_ = std::unique_ptr(new GraphCodegen()); + graph_codegen_ = std::unique_ptr(new GraphCodegen(target_host)); graph_codegen_->Init(nullptr, targets_); graph_codegen_->Codegen(func); - ret_.graph_json = graph_codegen_->GetJSON(); + ret_.graph = graph_codegen_->GetGraph(); ret_.params = graph_codegen_->GetParams(); auto lowered_funcs = graph_codegen_->GetIRModule(); @@ -524,7 +562,8 @@ class RelayBuildModule : public runtime::ModuleNode { } auto ext_mods = graph_codegen_->GetExternalModules(); - ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, GetTargetHost()); + ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, GetTargetHost(), + graph_codegen_->GetAOTMetdata()); } private: diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 3ea8a2bed91b..4fe47c6692e2 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -250,7 +250,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator storage_info; for (auto& v : storage_device_info[0]) { @@ -604,7 +604,7 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { Function func = args[0]; this->output_ = this->codegen_->Codegen(func); }); - } else if (name == "get_graph_json") { + } else if (name == "get_graph") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.graph_json; }); } else if (name == "list_params_name") { diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index cf843236da61..bfd1cdc1f77c 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -209,6 +209,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { for (const auto& kv : token_map_) { std::vector storage_ids; std::vector device_types; + std::vector sid_sizes; for (StorageToken* tok : kv.second) { if (tok->device_type) { num_annotated_nodes++; @@ -216,8 +217,9 @@ class StorageAllocator : public StorageAllocaBaseVisitor { num_nodes++; storage_ids.push_back(tok->storage_id); device_types.push_back(tok->device_type); + sid_sizes.push_back(GetMemorySize(tok)); } - smap.Set(GetRef(kv.first), Array({storage_ids, device_types})); + smap.Set(GetRef(kv.first), Array({storage_ids, device_types, sid_sizes})); } // Either all or none of the nodes should be annotated. if (num_annotated_nodes != 0 && num_annotated_nodes != num_nodes) { diff --git a/src/runtime/crt/aot/tvm_executor.c b/src/runtime/crt/aot/tvm_executor.c new file mode 100644 index 000000000000..74069c6af26e --- /dev/null +++ b/src/runtime/crt/aot/tvm_executor.c @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// LINT_C_FILE + +/*! + * \file src/runtime/crt/aot/tvm_executor.c + * \brief Internal implementation of the AOT Executor + */ + +#include "tvm_executor.h" + +#include + +#include "tvm_backend.h" +#include "tvm_error.h" + +tvm_workspace_t* tvm_runtime_workspace; + +tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs, + tvm_context_t* context) { + static DLContext fake_ctx = {kDLCPU, 0}; + static int64_t fake_dims = 0; + static int64_t fake_shape = {0}; + + DLTensor tensors[model->num_input_tensors + model->num_output_tensors]; // NOLINT + TVMValue tvm_values[model->num_input_tensors + model->num_output_tensors]; // NOLINT + int32_t tvm_typeids[model->num_input_tensors + model->num_output_tensors]; // NOLINT + + for (int i = 0; i < model->num_input_tensors; i++) { + tensors[i] = (DLTensor){ + .ctx = fake_ctx, + .data = inputs[i], + .shape = &fake_shape, + .ndim = fake_dims, + .byte_offset = 0, + .strides = NULL, + }; + tvm_values[i].v_handle = &tensors[i]; + } + + for (int i = 0; i < model->num_output_tensors; i++) { + tensors[model->num_input_tensors + i] = (DLTensor){ + .ctx = fake_ctx, + .data = outputs[i], + .shape = &fake_shape, + .ndim = fake_dims, + .byte_offset = 0, + .strides = NULL, + }; + tvm_values[model->num_input_tensors + i].v_handle = &tensors[model->num_input_tensors + i]; + } + + return model->run_func(&tvm_values, &tvm_typeids, 0, NULL, 0, context); +} + +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, + int dtype_bits_hint) { + uint32_t offset = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT - 1); + uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; + uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset; + uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; + + if (next_alloc > workspace_end) { + return NULL; + } + + tvm_runtime_workspace->next_alloc = next_alloc; + return current_alloc; +} + +tvm_crt_error_t TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + tvm_runtime_workspace->next_alloc = ptr; + return 0; +} diff --git a/src/runtime/crt/graph_executor/graph_executor.c b/src/runtime/crt/graph_executor/graph_executor.c index 2fe9e73aeddc..614d9d10d43d 100644 --- a/src/runtime/crt/graph_executor/graph_executor.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -877,8 +877,14 @@ int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_bl void TVMGraphExecutor_Run(TVMGraphExecutor* executor) { // setup the array and requirements. uint32_t idx; +<<<<<<< HEAD:src/runtime/crt/graph_executor/graph_executor.c for (idx = 0; idx < executor->op_execs_count; ++idx) { if (executor->op_execs[idx].fexec) { +======= + + for (idx = 0; idx < runtime->op_execs_count; ++idx) { + if (runtime->op_execs[idx].fexec) { +>>>>>>> a01a38ec7... [AOT] Introducing AOT in TVM:src/runtime/crt/graph_runtime/graph_runtime.c #if TVM_CRT_DEBUG printf("calling: %s (%d)\n", executor->op_execs[idx].name, idx); #endif // TVM_CRT_DEBUG diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 03dba399fcb4..aa819ea2343c 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -32,6 +32,7 @@ #include #include +#include #include #include "runtime_base.h" @@ -39,6 +40,37 @@ namespace tvm { namespace runtime { +/*! + * \brief Structure used by the AOT to fill the tvm_module_t structure + */ +class AOTMetadataNode : public Object { + public: + /*! 
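On the allocator above: TVMBackendAllocWorkspace rounds every request up to TVM_RUNTIME_ALLOC_ALIGNMENT by adding (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT - 1), the two's-complement remainder to the next multiple of the alignment, and TVMBackendFreeWorkspace simply rewinds next_alloc, so releases must happen in reverse allocation order. A small standalone check of the padding arithmetic, assuming the default 16-byte alignment:

    /* Hedged, standalone reproduction of the padding computation only. */
    #include <assert.h>
    #include <stdint.h>

    #define ALIGN 16  /* default TVM_RUNTIME_ALLOC_ALIGNMENT */

    static uint64_t padded_size(uint64_t nbytes) {
      uint32_t offset = (~nbytes + 1) & (ALIGN - 1);  /* bytes to reach the next multiple */
      return nbytes + offset;
    }

    int main(void) {
      assert(padded_size(1) == 16);
      assert(padded_size(10) == 16);
      assert(padded_size(16) == 16);  /* already aligned: no padding added */
      assert(padded_size(17) == 32);
      return 0;
    }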
\brief number of inputs of the main function */ + int num_inputs = 1; + /*! \brief number of outputs of the main function */ + int num_outputs = 1; + + static constexpr const uint32_t _type_index = TypeIndex::kDynamic; + static constexpr const char* _type_key = "AOTMetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(AOTMetadataNode, Object); +}; + +/*! + * \brief Managed reference to AOTMetadataNode. + */ +class AOTMetadata : public ObjectRef { + public: + TVM_DLL AOTMetadata(int num_inputs, int num_outputs) { + auto n = make_object(); + n->num_inputs = num_inputs; + n->num_outputs = num_outputs; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(AOTMetadata, ObjectRef, AOTMetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(AOTMetadataNode); +}; + /*! * \brief Create a metadata module object. * diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 8184e9189c4b..55b445c34b4a 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -46,7 +46,8 @@ namespace codegen { */ runtime::Module CreateMetadataModule( const std::unordered_map& params, - tvm::runtime::Module target_module, const Array& ext_modules, Target target) { + tvm::runtime::Module target_module, const Array& ext_modules, Target target, + runtime::AOTMetadata aot_metadata) { // Here we split modules into two groups: // 1. Those modules which can be exported to C-runtime. These are DSO-exportable // (i.e. llvm or c) modules which return nothing from get_const_vars(). @@ -114,7 +115,7 @@ runtime::Module CreateMetadataModule( if (target->kind->name == "c") { crt_exportable_modules.push_back(target_module); - target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target); + target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target, aot_metadata); } else if (target->kind->name == "llvm") { #ifdef TVM_LLVM_VERSION crt_exportable_modules.push_back(target_module); diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h index 83cb29dd5a46..49404a63fdeb 100644 --- a/src/target/metadata_module.h +++ b/src/target/metadata_module.h @@ -33,12 +33,15 @@ #include #include +#include "../runtime/meta_data.h" + namespace tvm { namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, - tvm::runtime::Module target_module, const Array& ext_modules, Target target); + tvm::runtime::Module target_module, const Array& ext_modules, Target target, + int num_inputs = 1, int num_outputs = 1); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index af4bb48d1d73..ac0f0b9f07dc 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -40,13 +40,21 @@ namespace codegen { CodeGenCHost::CodeGenCHost() { module_name_ = GetUniqueName("__tvm_module_ctx"); } -void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_str) { +void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool is_aot_executor, + std::string target_str) { emit_asserts_ = emit_asserts; + is_aot_executor_ = is_aot_executor; declared_globals_.clear(); decl_stream << "// tvm target: " << target_str << "\n"; decl_stream << "#define TVM_EXPORTS\n"; - decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; - decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; + if (is_aot_executor) { + decl_stream << "#include \"tvm_executor.h\"\n"; + decl_stream << "#include \"dlpack/dlpack.h\"\n"; + } else { + decl_stream << 
"#include \"tvm/runtime/c_runtime_api.h\"\n"; + decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; + } + decl_stream << "#include \n"; decl_stream << "void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); @@ -211,21 +219,34 @@ void CodeGenCHost::PrintGetFuncFromBackend(const std::string& func_name, this->stream << "}\n"; } -void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, int num_args) { +void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, PrimExpr values, + int num_args) { this->PrintIndent(); + std::string stack_value = "stack_value"; + if (const VarNode* stack_value_var = values.as()) { + stack_value = stack_value_var->name_hint; + } std::string ret_val = GetUniqueName("ret_val"); std::string ret_type_code = GetUniqueName("ret_type_code"); this->stream << "TVMValue " << ret_val << ";\n"; this->PrintIndent(); this->stream << "int " << ret_type_code << ";\n"; this->PrintIndent(); - this->stream << "if (TVMFuncCall(" << packed_func_name << ", " - << "(TVMValue*) stack_value" - << ", " + + if (is_aot_executor_) { + this->stream << "if (" << packed_func_name << "( " + << "(TVMValue*) " << stack_value; + } else { + this->stream << "if (TVMFuncCall(" << packed_func_name << ", " + << "(TVMValue*) stack_value"; + } + this->stream << ", " << "(int*) stack_tcode" << ", " << num_args << ", " - << "&" << ret_val << ", " - << "&" << ret_type_code << ") != 0) {\n"; + << "&" << ret_val << ", "; + this->stream << "&" << ret_type_code; + this->stream << (is_aot_executor_ ? ", NULL" : "") << ") != 0) {\n"; + int func_call_scope = this->BeginScope(); this->PrintIndent(); this->stream << "return -1;\n"; @@ -277,8 +298,11 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT declared_globals_[packed_func_name] = unique_name; decl_stream << "static void* " << unique_name << " = NULL;\n"; } - this->PrintGetFuncFromBackend(func_name, unique_name); + if (!is_aot_executor_) { + this->PrintGetFuncFromBackend(func_name, unique_name); + } this->PrintFuncCall(unique_name, num_args); + } else if (op->op.same_as(builtin::tvm_throw_last_error())) { this->PrintIndent(); this->stream << "return -1;\n"; @@ -327,15 +351,19 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, const char* compare, } runtime::Module BuildCHost(IRModule mod, Target target) { + bool is_aot_executor = (target->GetAttr("executor").value_or("graph_runtime") == "aot"); + using tvm::runtime::Registry; bool output_ssa = false; bool emit_asserts = false; CodeGenCHost cg; - cg.Init(output_ssa, emit_asserts, target->str()); + cg.Init(output_ssa, emit_asserts, is_aot_executor, target->str()); Map linked_params; bool found_linked_params = false; bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); + PrimFunc aot_executor_fn; + for (auto kv : mod->functions) { if (could_have_linked_params && kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { @@ -347,6 +375,17 @@ runtime::Module BuildCHost(IRModule mod, Target target) { found_linked_params = true; continue; } + // Make sure that the executor function is the last one to be code generated so that all the + // symbols are available to tvm_run_func + if (is_aot_executor) { + auto fun_name = std::string(kv.first->name_hint); + const bool is_aot_executor_fn = + (fun_name.rfind(::tvm::runtime::symbol::tvm_run_func_prefix, 0) == 0); + if (is_aot_executor_fn) { + aot_executor_fn = Downcast(kv.second); + continue; + } + } ICHECK(kv.second->IsInstance()) << 
"CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); @@ -358,6 +397,12 @@ runtime::Module BuildCHost(IRModule mod, Target target) { cg.LinkParameters(linked_params); } + if (is_aot_executor) { + ICHECK(aot_executor_fn.defined()) + << "When using aot executor the executor function should be defined"; + cg.AddFunction(aot_executor_fn); + } + if (target->GetAttr("system-lib").value_or(Bool(false))) { ICHECK_EQ(target->GetAttr("runtime").value_or(""), "c") << "c target only supports generating C runtime SystemLibs"; diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index eace09f13a07..caf17ff832db 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -38,7 +38,7 @@ namespace codegen { class CodeGenCHost final : public CodeGenC { public: CodeGenCHost(); - void Init(bool output_ssa, bool emit_asserts, std::string target_str); + void Init(bool output_ssa, bool emit_asserts, bool is_aot_executor, std::string target_str); void AddFunction(const PrimFunc& f); @@ -69,9 +69,10 @@ class CodeGenCHost final : public CodeGenC { Array function_names_; /*! \brief whether to emit asserts in the resulting C code */ bool emit_asserts_; + bool is_aot_executor_; void PrintGetFuncFromBackend(const std::string& func_name, const std::string& packed_func_name); - void PrintFuncCall(const std::string& packed_func_name, int num_args); + void PrintFuncCall(const std::string& packed_func_name, PrimExpr values, int num_args); /*! * \brief Print ternary conditional operator implementing binary `op` diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 3baa44eb639f..e91d78f580f2 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -155,7 +155,8 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, */ runtime::Module CreateMetadataModule( const std::unordered_map& params, runtime::Module target_module, - const Array& ext_modules, Target target); + const Array& ext_modules, Target target, + runtime::AOTMetadata aot_metadata = runtime::AOTMetadata()); /*! * \brief Create a source module for viewing and limited saving for device. @@ -175,8 +176,8 @@ runtime::Module DeviceSourceModuleCreate( * \param target the target the modules are compiled for. * \return The wrapped module. 
*/ -runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, - Target target); +runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, + runtime::AOTMetadata aot_metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 26f1850c0e47..68de392e06f6 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -130,8 +130,8 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, - Target target) - : fmt_(fmt), func_names_(func_names), target_(target) { + Target target, runtime::AOTMetadata aot_metadata) + : fmt_(fmt), func_names_(func_names), target_(target), aot_metadata_(aot_metadata) { CreateSource(); } const char* type_key() const { return "c"; } @@ -159,6 +159,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::string fmt_; Array func_names_; Target target_; + runtime::AOTMetadata aot_metadata_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -191,17 +192,35 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << "}\n"; } + void GenerateAOTDescriptor() { + code_ << "#include \n"; + code_ << "#ifdef __cplusplus\n"; + code_ << "extern \"C\"\n"; + code_ << "#endif\n"; + code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix; + code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " + "out_type_code, void* resource_handle);\n"; + code_ << "const tvm_model_t network = {\n" + << " .run_func = &" << ::tvm::runtime::symbol::tvm_run_func_prefix << ",\n" + << " .num_input_tensors = " << aot_metadata_->num_inputs << ",\n" + << " .num_output_tensors = " << aot_metadata_->num_outputs << ", \n" + << "};\n"; + } + void CreateSource() { if (target_->GetAttr("system-lib").value_or(Bool(false)) && !func_names_.empty()) { CreateFuncRegistry(); GenerateCrtSystemLib(); } + if (target_->GetAttr("executor").value_or("graph_runtime") == "aot") { + GenerateAOTDescriptor(); + } code_ << ";"; } }; -runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, - Target target) { +runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, + runtime::AOTMetadata aot_metadata) { Array func_names; for (runtime::Module mod : modules) { auto pf_funcs = mod.GetFunction("get_func_names"); @@ -212,7 +231,7 @@ runtime::Module CreateCSourceCrtMetadataModule(const Array& mod } } } - auto n = make_object(func_names, "cc", target); + auto n = make_object(func_names, "cc", target, aot_metadata); auto csrc_metadata_module = runtime::Module(n); for (const auto& mod : modules) { csrc_metadata_module.Import(mod); @@ -283,7 +302,8 @@ TVM_REGISTER_GLOBAL("runtime.CSourceModuleCreate") TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule") .set_body_typed([](const Array& modules, Target target) { - return CreateCSourceCrtMetadataModule(modules, target); + // Note that we don't need metadata when we compile a single operator + return CreateCSourceCrtMetadataModule(modules, target, runtime::AOTMetadata()); }); } // namespace codegen diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index 45858b9f4ef2..f4f52eccd1dd 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -29,6 +29,8 @@ #include #include +#include 
"../../runtime/meta_data.h" + namespace tvm { namespace codegen { @@ -38,7 +40,8 @@ namespace codegen { * \param target TVM target. */ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, - tvm::Target target); + tvm::Target target, + runtime::AOTMetadata aot_metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 08842554257b..474b1b0d8ac4 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -227,6 +227,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("runtime") .add_attr_option("mcpu") .add_attr_option("march") + .add_attr_option("executor") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLGPU) @@ -308,8 +309,7 @@ TVM_REGISTER_TARGET_KIND("ext_dev", kDLExtDev) // line break TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU) // line break .add_attr_option("system-lib"); -TVM_REGISTER_TARGET_KIND("composite", kDLCPU) - .add_attr_option>("devices"); +TVM_REGISTER_TARGET_KIND("composite", kDLCPU).add_attr_option>("devices"); /********** Registry **********/ diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 1117571c8b75..c4d76d4a7494 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -174,6 +174,9 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_stack_make_array) TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(tvm_call_unpacked) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + TIR_DEFINE_BUILTIN_FUNC(tvm_call_trace_packed) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 8d2857ef7a40..ef50dae82ce0 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -297,6 +297,7 @@ class BuiltinLower : public StmtExprMutator { Array packed_args = {op->args[0], scope.stack_value, scope.stack_tcode, ConstInt32(arg_stack_begin), ConstInt32(arg_stack_begin + op->args.size() - 1)}; + // call_packed_lowered needs to do the type casting properly return Call(op->dtype, builtin::tvm_call_packed_lowered(), packed_args); } diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 344fd3d40ba8..e505912d9ffe 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -112,7 +112,7 @@ TEST(Relay, BuildModule) { auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); auto build_f = build_mod.GetFunction("build", false); - auto json_f = build_mod.GetFunction("get_graph_json", false); + auto json_f = build_mod.GetFunction("get_graph", false); auto mod_f = build_mod.GetFunction("get_module", false); Map targets; Target llvm_tgt = Target("llvm"); diff --git a/tests/cpp/utvm_runtime_standalone_test.cc b/tests/cpp/utvm_runtime_standalone_test.cc index 5c642a37d6bc..a020aaf55f17 100644 --- a/tests/cpp/utvm_runtime_standalone_test.cc +++ b/tests/cpp/utvm_runtime_standalone_test.cc @@ -85,7 +85,7 @@ TEST(MicroStandaloneRuntime, BuildModule) { auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); auto build_f = build_mod.GetFunction("build", false); - auto json_f = build_mod.GetFunction("get_graph_json", false); + auto json_f = build_mod.GetFunction("get_graph", false); auto mod_f = build_mod.GetFunction("get_module", false); Map targets; 
diff --git a/tests/crt/aot_executor_test.cc b/tests/crt/aot_executor_test.cc new file mode 100644 index 000000000000..753d9d9dc4de --- /dev/null +++ b/tests/crt/aot_executor_test.cc @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "tvm_executor.h" + +int32_t test_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, + void* out_ret_tcode, void* resource_handle) { + return kTvmErrorNoError; +} + +TEST(AOTRuntime, NoOp) { + const tvm_model_t test_model = { + .num_input_tensors = 0, + .num_output_tensors = 0, + .run_func = &test_run_func, + }; + + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&test_model, NULL, NULL, NULL)); +} + +int32_t error_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, + void* out_ret_tcode, void* resource_handle) { + return kTvmErrorPlatformNoMemory; +} + +TEST(AOTRuntime, Error) { + const tvm_model_t error_model = { + .num_input_tensors = 0, + .num_output_tensors = 0, + .run_func = &error_run_func, + }; + + ASSERT_EQ(kTvmErrorPlatformNoMemory, tvm_runtime_run(&error_model, NULL, NULL, NULL)); +} + +int32_t identity_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, + void* out_ret_tcode, void* resource_handle) { + void* arg0 = (((TVMValue*)args)[0].v_handle); + void* arg1 = (((TVMValue*)args)[1].v_handle); + void* placeholder = (((DLTensor*)arg0)[0].data); + void* T_id = (((DLTensor*)arg1)[0].data); + ((uint32_t*)T_id)[(0)] = ((uint32_t*)placeholder)[(0)]; + return kTvmErrorNoError; +} + +TEST(AOTRuntime, Identity) { + const tvm_model_t identity_model = { + .num_input_tensors = 1, + .num_output_tensors = 1, + .run_func = &identity_run_func, + }; + + uint32_t inputs1[1] = {404}; + void* inputs[] = {inputs1}; + uint32_t outputs1[1]; + void* outputs[] = {outputs1}; + + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&identity_model, inputs, outputs, NULL)); + ASSERT_EQ(outputs1[0], 404); +} + +int32_t add_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, + void* out_ret_tcode, void* resource_handle) { + void* arg0 = (((TVMValue*)args)[0].v_handle); + void* arg1 = (((TVMValue*)args)[1].v_handle); + void* placeholder = (((DLTensor*)arg0)[0].data); + void* T_add = (((DLTensor*)arg1)[0].data); + ((uint32_t*)T_add)[(0)] = ((uint32_t*)placeholder)[(0)] + ((uint32_t*)placeholder)[(1)]; + return kTvmErrorNoError; + + return kTvmErrorNoError; +} + +TEST(AOTRuntime, Add) { + const tvm_model_t add_model = { + .num_input_tensors = 1, + .num_output_tensors = 1, + .run_func = &add_run_func, + }; + + uint32_t inputs1[2] = {404, 500}; + void* inputs[] = {inputs1}; + uint32_t outputs1[1]; + void* outputs[] = {outputs1}; + + ASSERT_EQ(kTvmErrorNoError, 
tvm_runtime_run(&add_model, inputs, outputs, NULL)); + ASSERT_EQ(outputs1[0], 904); +} + +int32_t multiple_inputs_run_func(void* args, void* arg_type_ids, int32_t num_args, + void* out_ret_value, void* out_ret_tcode, void* resource_handle) { + void* arg0 = (((TVMValue*)args)[0].v_handle); + void* arg1 = (((TVMValue*)args)[1].v_handle); + void* arg2 = (((TVMValue*)args)[2].v_handle); + void* placeholder = (((DLTensor*)arg0)[0].data); + void* placeholder1 = (((DLTensor*)arg1)[0].data); + void* T_add = (((DLTensor*)arg2)[0].data); + ((uint32_t*)T_add)[(0)] = ((uint32_t*)placeholder)[(0)] + ((uint32_t*)placeholder)[(1)] + + ((uint32_t*)placeholder1)[(0)] + ((uint32_t*)placeholder1)[(1)]; + return kTvmErrorNoError; +} + +TEST(AOTRuntime, MultipleInputs) { + const tvm_model_t multiple_inputs_model = { + .num_input_tensors = 2, + .num_output_tensors = 1, + .run_func = &multiple_inputs_run_func, + }; + + uint32_t inputs1[2] = {404, 500}; + uint32_t inputs2[2] = {200, 202}; + void* inputs[] = {inputs1, inputs2}; + + uint32_t outputs1[1]; + void* outputs[] = {outputs1}; + + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_inputs_model, inputs, outputs, NULL)); + ASSERT_EQ(outputs1[0], 1306); +} + +int32_t multiple_outputs_run_func(void* args, void* arg_type_ids, int32_t num_args, + void* out_ret_value, void* out_ret_tcode, void* resource_handle) { + void* arg0 = (((TVMValue*)args)[0].v_handle); + void* arg1 = (((TVMValue*)args)[1].v_handle); + void* arg2 = (((TVMValue*)args)[2].v_handle); + void* placeholder = (((DLTensor*)arg0)[0].data); + void* T_split1 = (((DLTensor*)arg1)[0].data); + void* T_split2 = (((DLTensor*)arg2)[0].data); + ((uint32_t*)T_split1)[(0)] = ((uint32_t*)placeholder)[(0)]; + ((uint32_t*)T_split2)[(0)] = ((uint32_t*)placeholder)[(1)]; + return kTvmErrorNoError; +} + +TEST(AOTRuntime, MultipleOutputs) { + const tvm_model_t multiple_outputs_model = { + .num_input_tensors = 1, + .num_output_tensors = 2, + .run_func = &multiple_outputs_run_func, + }; + + uint32_t inputs1[2] = {404, 500}; + void* inputs[] = {inputs1}; + + uint32_t outputs1[1]; + uint32_t outputs2[1]; + void* outputs[] = {outputs1, outputs2}; + + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_outputs_model, inputs, outputs, NULL)); + ASSERT_EQ(outputs1[0], 404); + ASSERT_EQ(outputs2[0], 500); +} + +int32_t resource_handle_check_run_func(void* args, void* arg_type_ids, int32_t num_args, + void* out_ret_value, void* out_ret_tcode, + void* resource_handle) { + if (resource_handle == NULL) { + return kTvmErrorFunctionCallWrongArgType; + } + return kTvmErrorNoError; +} + +TEST(AOTRuntime, ContextPassing) { + tvm_context_t stub_context = {}; + const tvm_model_t resource_handle_check_model = { + .num_input_tensors = 0, + .num_output_tensors = 0, + .run_func = &resource_handle_check_run_func, + }; + + ASSERT_EQ(kTvmErrorNoError, + tvm_runtime_run(&resource_handle_check_model, NULL, NULL, &stub_context)); + ASSERT_EQ(kTvmErrorFunctionCallWrongArgType, + tvm_runtime_run(&resource_handle_check_model, NULL, NULL, NULL)); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc new file mode 100644 index 000000000000..a5df9a5b6477 --- /dev/null +++ b/tests/crt/aot_memory_test.cc @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "tvm_backend.h" + +// TODO(Mousius) - Move memory allocation to individual networks +extern tvm_workspace_t* tvm_runtime_workspace; + +/* + * Tests allocations are properly aligned when allocated + */ +TEST(AOTMemory, Allocate) { + static uint8_t model_memory[80]; + tvm_workspace_t workspace = { + .next_alloc = model_memory, + .workspace = model_memory, + .workspace_size = 80, + }; + tvm_runtime_workspace = &workspace; + + void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_one, &model_memory[0]); + + void* block_two = TVMBackendAllocWorkspace(0, 0, 2, 0, 0); + ASSERT_EQ(block_two, &model_memory[16]); + + void* two_blocks = TVMBackendAllocWorkspace(0, 0, 24, 0, 0); + ASSERT_EQ(two_blocks, &model_memory[32]); + + void* block_three = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_three, &model_memory[64]); +} + +/* + * Tests resetting the stack after dealloc + */ +TEST(AOTMemory, Free) { + static uint8_t model_memory[80]; + tvm_workspace_t workspace = { + .next_alloc = model_memory, + .workspace = model_memory, + .workspace_size = 80, + }; + tvm_runtime_workspace = &workspace; + + void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_one, &model_memory[0]); + + void* block_two = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_two, &model_memory[16]); + ASSERT_EQ(0, TVMBackendFreeWorkspace(0, 0, block_two)); + + void* two_blocks = TVMBackendAllocWorkspace(0, 0, 2, 0, 0); + ASSERT_EQ(two_blocks, &model_memory[16]); + ASSERT_EQ(0, TVMBackendFreeWorkspace(0, 0, two_blocks)); + + void* block_three = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_three, &model_memory[16]); +} + +/* + * Tests we return NULL if we over allocate + */ +TEST(AOTMemory, OverAllocate) { + static uint8_t model_memory[72]; + tvm_workspace_t workspace = { + .next_alloc = model_memory, + .workspace = model_memory, + .workspace_size = 72, + }; + tvm_runtime_workspace = &workspace; + + void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_one, &model_memory[0]); + + void* block_two = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + ASSERT_EQ(block_two, &model_memory[16]); + + void* two_blocks = TVMBackendAllocWorkspace(0, 0, 64, 0, 0); + ASSERT_EQ(two_blocks, (void*)NULL); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk new file mode 100644 index 000000000000..66dd6e6ae21f --- /dev/null +++ b/tests/python/relay/aot/aot_test.mk @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# Makefile to build ethosu_test_runner +# Setup build environment +# +AOT_ROOT ?= $(TVM_ROOT)/src/runtime/crt/aot + +ENABLE_TVM_PLATFORM_ABORT_BACKTRACE = 0 +DMLC_CORE=$(TVM_ROOT)/3rdparty/dmlc-core +PKG_COMPILE_OPTS = -g +CC = gcc +AR = ar +RANLIB = ranlib +CC_OPTS = CC=$(CC) AR=$(AR) RANLIB=$(RANLIB) + + +PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ + -I$(TVM_ROOT)/include/tvm/runtime/crt/aot \ + -I$(TVM_ROOT)/src/runtime/crt/include \ + -I$(DMLC_CORE)/include \ + -I$(TVM_ROOT)/3rdparty/dlpack/include \ + -I$(AOT_ROOT)\ + -I$(build_dir) + +$(ifeq VERBOSE,1) +QUIET ?= +$(else) +QUIET ?= @ +$(endif) + +CRT_SRCS = $(shell find $(CRT_ROOT)) + +aot_test_runner: $(build_dir)/aot_test_runner + +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm + +$(build_dir)/lib1.o: $(build_dir)/../codegen/host/src/lib1.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + +$(build_dir)/lib0.o: $(build_dir)/../codegen/host/src/lib0.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + +$(build_dir)/tvm_executor.o: $(TVM_ROOT)/src/runtime/crt/aot/tvm_executor.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + +clean: + $(QUIET)rm -rf $(build_dir)/crt +cleanall: + $(QUIET)rm -rf $(build_dir) +# Don't define implicit rules; they tend to match on logical target names that aren't targets (i.e. bundle_static) +.SUFFIXES: +.DEFAULT: ethosu_test_runner diff --git a/tests/python/relay/aot/infra.py b/tests/python/relay/aot/infra.py new file mode 100644 index 000000000000..475b150ccd65 --- /dev/null +++ b/tests/python/relay/aot/infra.py @@ -0,0 +1,213 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module provides infrastructure to verify the correctness of +the command stream produced. 
+Currently it will invoke vela to generate a vela-optimized tflite +in which the command stream is contained as a custom operator. +This class include methods to parse the custom operator to extract +the command stream and perform an equivalency check for single operator +test cases. +""" +import tflite +import os +import io +import struct +import numpy as np +import pathlib +import shutil +import subprocess +import tempfile +import tarfile + + +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.op.contrib import get_pattern_table +from tvm.contrib import utils +from tvm.relay.backend import compile_engine +from tvm.contrib import utils +from tvm.contrib import graph_runtime +from tvm.micro import export_model_library_format + + +def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): + """ + This method runs a process and logs the output to both a log file and stdout + """ + with subprocess.Popen( + cmd, cwd=cwd, shell=True, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as proc, open(logfile, "a") as f: + while True: + data = proc.stdout.readline() + result = proc.poll() + # process is done if there is no data and the result is valid + if data == b"" and result is not None: + return int(result) + if data: + text = data.decode("ascii", errors="backslashreplace") + f.write(text) + if stdout: + print(text, end="") + + +def create_main(test_name, input_list, output_list, output_path): + file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() + # create header file + raw_path = file_path.with_suffix(".c").resolve() + with open(raw_path, "w") as main_file: + main_file.write("#include \n") + main_file.write("#include \n") + main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") + main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") + + for i in range(0, len(input_list)): + main_file.write('#include "input_data%i.h"\n' % i) + for i in range(0, len(output_list)): + main_file.write('#include "expected_output_data%i.h"\n' % i) + main_file.write('#include "output_data%i.h"\n' % i) + + main_file.write("extern tvm_model_t network;\n") + main_file.write("extern tvm_workspace_t *tvm_runtime_workspace;\n") + main_file.write("int main(){\n") + main_file.write("void* inputs[%i] = { " % (len(input_list))) + + for i in range(0, len(input_list)): + main_file.write("input_data%i, " % i) + main_file.write("};\n") + + main_file.write("void* outputs[%i] = { " % (len(output_list))) + for i in range(0, len(output_list)): + main_file.write("output_data%i, " % i) + main_file.write("};\n") + + main_file.write("") + main_file.write( + "tvm_workspace_t app_workspace = {.next_alloc=g_aot_memory, .workspace=g_aot_memory, .workspace_size=WORKSPACE_SIZE};\n" + ) + main_file.write("tvm_runtime_workspace = &app_workspace;\n") + main_file.write("tvm_runtime_run(&network, inputs, outputs, NULL);") + + for i in range(0, len(output_list)): + main_file.write("for (int i = 0; i\n") + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + + if npy_data.dtype == "int8": + header_file.write(f"int8_t {tensor_name}[] =") + elif npy_data.dtype == "int32": + header_file.write(f"int32_t {tensor_name}[] = ") + elif npy_data.dtype == "uint8": + header_file.write(f"uint8_t {tensor_name}[] = ") + elif npy_data.dtype == "float32": + header_file.write(f"float {tensor_name}[] = ") + + header_file.write("{") + for i in np.ndindex(npy_data.shape): + 
header_file.write(f"{npy_data[i]}, ") + header_file.write("};\n\n") + + +def verify_source(mod, input_list, output_list, params=None): + """ + This method verifies the generated source + """ + target = "c -runtime=c --link-params --executor=aot" + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lib = tvm.relay.build(mod, target, target_host=target, params=params) + + tmp_path = utils.tempdir() + tmp_dir = tmp_path.temp_dir + + base_path = os.path.join(tmp_dir, "test") + build_path = os.path.join(base_path, "build") + os.makedirs(build_path, exist_ok=True) + + tar_file = os.path.join(base_path, "test.tar") + export_model_library_format(lib, tar_file) + t = tarfile.open(tar_file) + t.extractall(base_path) + + for i in range(len(input_list)): + create_header_file((f"input_data{i}"), input_list[i], build_path) + + for i in range(len(output_list)): + create_header_file( + (f"output_data{i}"), + np.zeros(output_list[i].shape, output_list[i].dtype), + build_path, + ) + create_header_file((f"expected_output_data{i}"), output_list[i], build_path) + + create_main("test.c", input_list, output_list, build_path) + + # Verify that compiles fine + file_dir = os.path.dirname(os.path.abspath(__file__)) + makefile = os.path.join(file_dir, "aot_test.mk") + make_cmd = f"make -f {makefile} build_dir=" + build_path + f" TVM_ROOT={file_dir}/../../../.." + + compile_log_path = os.path.join(build_path, "test_compile.log") + ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False) + assert ret == 0 + + # Verify that runs fine + run_log_path = os.path.join(build_path, "test_run.log") + ret = subprocess_with_stdout_and_log("./aot_test_runner", build_path, run_log_path, False) + assert ret == 0 + + +def generate_ref_data(mod, input_data, params=None, target="llvm"): + """Generate reference data through executing the relay module""" + compile_engine.get().clear() + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + + lib_name = "mod.so" + temp = utils.tempdir() + lib_path = temp.relpath(lib_name) + lib.export_library(lib_path) + lib = tvm.runtime.load_module(lib_path) + grt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + grt_mod.set_input(**input_data) + grt_mod.run() + output_count = grt_mod.get_num_outputs() + out = [grt_mod.get_output(i).asnumpy() for i in range(output_count)] + return out diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py new file mode 100644 index 000000000000..b6480e039c61 --- /dev/null +++ b/tests/python/relay/aot/test_crt_aot.py @@ -0,0 +1,258 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
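+# Helper functions used below come from tests/python/relay/aot/infra.py via the
+# star import: generate_ref_data runs the model on the reference graph executor,
+# while verify_source rebuilds it with the AOT C target
+# ("c -runtime=c --link-params --executor=aot"), compiles the generated sources
+# with aot_test.mk and checks the outputs against the reference data.
+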
+ +import tflite +import os +import io +import struct +import numpy as np +import pathlib +import shutil +import subprocess +import tempfile +import tarfile + + +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.op.contrib import get_pattern_table +from tvm.contrib import utils +from tvm.relay.backend import compile_engine +from tvm.contrib import utils +from tvm.contrib import graph_runtime +from tvm.micro import export_model_library_format +from tvm.relay import testing + +from infra import * + + +def test_conv_with_params(): + RELAY_MODEL = """ +#[version = "0.0.5"] +def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) { + %1 = nn.conv2d( + %data, + %weight, + padding=[2, 2], + channels=8, + kernel_size=[5, 5], + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32"); + %1 +} +""" + mod = tvm.parser.fromtext(RELAY_MODEL) + main_func = mod["main"] + shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params} + type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params} + + weight_data = np.ones(shape_dict["weight"]).astype(type_dict["weight"]) + input_data = np.ones(shape_dict["data"]).astype(type_dict["data"]) + + params = {"weight": weight_data} + inputs = {"data": input_data} + output_list = generate_ref_data(mod, inputs, params) + + input_list = [input_data] + verify_source(mod, input_list, output_list, params) + + +def test_add_with_params(): + x = relay.var("x", shape=(1, 10)) + y = relay.var("y", shape=(1, 10)) + z = relay.add(x, y) + func = relay.Function([x, y], z) + + x_in = np.ones((1, 10)).astype("float32") + y_in = np.random.uniform(size=(1, 10)).astype("float32") + + params = {"x": x_in} + inputs = {"y": y_in} + output_list = generate_ref_data(func, inputs, params) + + input_list = [y_in] + verify_source(func, input_list, output_list, params) + + +def test_conv2d(): + """Test a subgraph with a single conv2d operator.""" + + def conv2d_direct(): + dtype = "float32" + ishape = (1, 32, 14, 14) + w1shape = (32, 32, 3, 3) + + data0 = relay.var("data", shape=ishape, dtype=dtype) + weight0 = relay.var("weight", shape=w1shape, dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1)) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) + + return mod, {"data": i_data, "weight": w1_data}, (1, 32, 14, 14) + + def group_conv2d(): + dtype = "float32" + ishape = (1, 32, 14, 14) + w2shape = (32, 1, 3, 3) + + data0 = relay.var("data", shape=(ishape), dtype=dtype) + weight0 = relay.var("weight", shape=(w2shape), dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=32) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w_data = np.random.uniform(0, 1, w2shape).astype(dtype) + + return mod, {"data": i_data, "weight": w_data}, (1, 32, 14, 14) + + for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: + output_list = generate_ref_data(mod, inputs) + input_list = [inputs["data"], inputs["weight"]] + verify_source(mod, input_list, output_list) + + +def test_concatenate(): + dtype = "float32" + x = relay.var("x", shape=(10, 5), dtype=dtype) + y = relay.var("y", shape=(10, 
5), dtype=dtype) + t = relay.var("z", shape=(), dtype=dtype) + z = relay.concatenate((x, y), axis=1) + z = relay.add(z, t) + # Check result. + func = relay.Function([x, y, t], z) + x_data = np.random.rand(10, 5).astype(dtype) + y_data = np.random.rand(10, 5).astype(dtype) + t_data = np.random.uniform(size=()).astype(dtype) + inputs = {"x": x_data, "y": y_data, "z": t_data} + + output_list = generate_ref_data(func, inputs) + input_list = [inputs["x"], inputs["y"], inputs["z"]] + verify_source(func, input_list, output_list) + + +def test_nested_tuples(): + x = relay.var("x", shape=(10,)) + x1 = x + relay.const(1.0) + x2 = x1 + relay.const(1.0) + x3 = x2 + relay.const(1.0) + x4 = x3 + relay.const(1.0) + out = relay.Tuple([x1, relay.Tuple([relay.Tuple([x2, x3]), x4])]) + func = relay.Function([x], out) + + x_data = np.random.uniform(size=(10,)).astype(np.float32) + inputs = {"x": x_data} + output_list = generate_ref_data(func, inputs) + input_list = [x_data] + verify_source(func, input_list, output_list) + + +def test_tuple_getitem(): + func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) + output_list = generate_ref_data(func, {}) + input_list = [] + verify_source(func, input_list, output_list) + + +def test_id(): + x = relay.var("x", "float32") + ident = relay.Function([x], x) + one = np.array(1.0, "float32") + inputs = {"x": one} + output_list = generate_ref_data(ident, inputs) + input_list = [one] + verify_source(ident, input_list, output_list) + + +def test_add_const(): + two = relay.add(relay.const(1), relay.const(1)) + func = relay.Function([], two) + output_list = generate_ref_data(func, {}) + input_list = [] + verify_source(func, input_list, output_list) + + +def test_mul_param(): + x = relay.var("x", shape=(10, 10)) + y = relay.var("y", shape=(1, 10)) + func = relay.Function([x, y], relay.multiply(x, y)) + x_data = np.random.rand(10, 10).astype("float32") + y_data = np.random.rand(1, 10).astype("float32") + inputs = {"x": x_data, "y": y_data} + output_list = generate_ref_data(func, inputs) + input_list = [inputs["x"], inputs["y"]] + verify_source(func, input_list, output_list) + + +def test_subtract(): + i = relay.var("i", shape=[], dtype="int32") + sub = relay.subtract(i, relay.const(1, dtype="int32")) + func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32")) + i_data = np.array(1, dtype="int32") + inputs = {"i": i_data} + output_list = generate_ref_data(func, inputs) + input_list = [inputs["i"]] + verify_source(func, input_list, output_list) + + +def test_tuple_output(): + x = relay.var("x", shape=(6, 9)) + y = relay.split(x, 3).astuple() + a = relay.TupleGetItem(y, 0) + b = relay.TupleGetItem(y, 1) + c = relay.TupleGetItem(y, 2) + out = relay.Tuple([a, b]) + func = relay.Function([x], out) + x_data = np.random.rand(6, 9).astype("float32") + inputs = {"x": x_data} + output_list = generate_ref_data(func, inputs) + input_list = [inputs["x"]] + verify_source(func, input_list, output_list) + + +def test_mobilenet(): + mod, params = testing.mobilenet.get_workload(batch_size=1) + data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] + data = np.random.uniform(size=data_shape).astype("float32") + inputs = {"data": data} + output_list = generate_ref_data(mod, inputs, params) + input_list = [inputs["data"]] + verify_source(mod, input_list, output_list, params) + + +if __name__ == "__main__": + test_tuple_output() + test_mobilenet() + test_subtract() + test_mul_param() + test_id() + test_add_const() + 
test_tuple_getitem() + test_nested_tuples() + test_concatenate() + test_conv_with_params() + test_add_with_params() + test_conv2d() diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 8e6fe298351e..ccf48e077511 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -134,7 +134,7 @@ def test_plan_memory(): storage_ids = set() device_types = set() for k, v in smap.items(): - assert len(v) == 2 + assert len(v) == 3 for x in v[0]: storage_ids.add(x.value) for x in v[1]: diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index a9c31f5ccedd..abf795cd46cc 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -266,7 +266,7 @@ def check_storage_and_device_types(): storage_ids = [] device_types = [] for _, storage_dev_type in smap.items(): - assert len(storage_dev_type) == 2 + assert len(storage_dev_type) == 3 for sid in storage_dev_type[0]: storage_ids.append(sid.value) for did in storage_dev_type[1]: diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 6d678b8a3753..2c0316d6582e 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -157,8 +157,8 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { factory = tvm.relay.build(relay_mod, target=TARGET) with _make_session(workspace, factory.get_lib()) as sess: - graph_mod = tvm.micro.create_local_graph_executor( - factory.get_json(), sess.get_system_lib(), sess.device + graph_mod = tvm.micro.create_local_graph_runtime( + factory.get_graph(), sess.get_system_lib(), sess.context ) A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 3ad515604d0b..8305f002a6a3 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -199,7 +199,7 @@ def test_llvm_link_params(): assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded assert mod.get_function("TVMSystemLibEntryPoint") != None - graph = json.loads(lib.graph_json) + graph = json.loads(lib.graph) for p in lib.params: _verify_linked_param(dtype, lib, mod, graph, p) or found_one @@ -310,7 +310,7 @@ def test_c_link_params(): lib_mod = tvm.runtime.load_module(lib_path) # lib_mod = lib_factory['default']() - graph = json.loads(lib.graph_json) + graph = json.loads(lib.graph) for p in lib.params: _verify_linked_param(dtype, lib, lib_mod, graph, p) diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index db6c55bca12a..642a521fe620 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -35,7 +35,7 @@ def validate_graph_json(extract_dir, factory): with open(os.path.join(extract_dir, "runtime-config", "graph", "graph.json")) as graph_f: graph_json = graph_f.read() - assert graph_json == factory.graph_json + assert graph_json == factory.graph # Just check it parses and looks roughly right. 
graph = json.loads(graph_json) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 1d80c60de790..9bcdf509c7e7 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -539,11 +539,11 @@ def test_debug_graph_executor(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # debug graph executor wrapper - debug_g_mod = debug_executor.GraphModuleDebug( - complied_graph_lib["debug_create"]("default", dev), - [dev], - complied_graph_lib.get_json(), + # debug graph runtime wrapper + debug_g_mod = debug_runtime.GraphModuleDebug( + complied_graph_lib["debug_create"]("default", ctx), + [ctx], + complied_graph_lib.get_graph(), None, ) debug_g_mod.set_input("data", data) From 984e03ee2c8c561c84948e3f1260f5c1188d629a Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 1 Apr 2021 19:58:46 +0100 Subject: [PATCH 02/33] Rebasing 2 Change-Id: Ia0a533a49960f1cb4bf3c3833511e539cf7c459f --- src/relay/backend/aot_codegen.cc | 5 +++-- src/relay/backend/build_module.cc | 6 ------ src/runtime/crt/graph_executor/graph_executor.c | 6 ------ 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/relay/backend/aot_codegen.cc b/src/relay/backend/aot_codegen.cc index 401334ef11cf..a6d79cd200f5 100644 --- a/src/relay/backend/aot_codegen.cc +++ b/src/relay/backend/aot_codegen.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -468,8 +469,8 @@ class AOTCodegen : public ExprVisitor { } // Define the attributes - body = tir::AttrStmt(PrimExpr(), tir::attr::device_context_type, 1, body); - body = tir::AttrStmt(PrimExpr(), tir::attr::device_context_id, 0, body); + body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body); + body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body); // Make the PrimFunc return tir::PrimFunc(main_signature_, body, VoidType(), Map(), diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index f1df6b1ad181..cd6ccae031be 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -505,12 +505,6 @@ class RelayBuildModule : public runtime::ModuleNode { // Relay IRModule -> IRModule optimizations. relay_module = Optimize(relay_module, targets_, params); - Target target_host = GetTargetHost(); - // If no target_host has been set, we choose a default one, which is - // llvm if "codegen.LLVMModuleCreate" is accessible. - const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); - // Get the updated function. auto func = Downcast(relay_module->Lookup("main")); diff --git a/src/runtime/crt/graph_executor/graph_executor.c b/src/runtime/crt/graph_executor/graph_executor.c index 614d9d10d43d..2fe9e73aeddc 100644 --- a/src/runtime/crt/graph_executor/graph_executor.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -877,14 +877,8 @@ int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_bl void TVMGraphExecutor_Run(TVMGraphExecutor* executor) { // setup the array and requirements. 
uint32_t idx; -<<<<<<< HEAD:src/runtime/crt/graph_executor/graph_executor.c for (idx = 0; idx < executor->op_execs_count; ++idx) { if (executor->op_execs[idx].fexec) { -======= - - for (idx = 0; idx < runtime->op_execs_count; ++idx) { - if (runtime->op_execs[idx].fexec) { ->>>>>>> a01a38ec7... [AOT] Introducing AOT in TVM:src/runtime/crt/graph_runtime/graph_runtime.c #if TVM_CRT_DEBUG printf("calling: %s (%d)\n", executor->op_execs[idx].name, idx); #endif // TVM_CRT_DEBUG From ff439d4a15f627acba3de1b16292b9006b65a1a9 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 7 Apr 2021 20:05:38 +0100 Subject: [PATCH 03/33] Applying comments/refactoring Change-Id: Iea1832355f8b1d4c921d02c6b4ceec7db3a681c1 --- cmake/modules/StandaloneCrt.cmake | 10 +- include/tvm/runtime/crt/aot/tvm_backend.h | 104 --------- include/tvm/runtime/crt/aot/tvm_error.h | 68 ------ .../{aot/tvm_executor.h => aot_executor.h} | 7 +- include/tvm/runtime/crt/stack_memory.h | 55 +++++ include/tvm/target/target_kind.h | 6 + include/tvm/tir/builtin.h | 4 - python/tvm/micro/model_library_format.py | 31 +-- python/tvm/relay/backend/executor_factory.py | 216 ++++++++++++++++++ .../relay/backend/graph_executor_factory.py | 97 -------- python/tvm/relay/build_module.py | 27 ++- src/relay/backend/aot_codegen.cc | 29 +-- src/relay/backend/build_module.cc | 53 ++++- src/relay/backend/graph_plan_memory.cc | 7 +- src/runtime/crt/Makefile | 1 + src/runtime/crt/aot/tvm_executor.c | 91 -------- src/runtime/crt/common/aot_backend_api.c | 59 +++++ src/runtime/crt/memory/stack_memory.c | 47 ++++ src/target/source/codegen_c_host.cc | 15 +- src/target/source/codegen_source_base.h | 2 + src/target/source/source_module.cc | 3 +- tests/crt/aot_executor_test.cc | 4 +- tests/crt/aot_memory_test.cc | 63 ++--- tests/python/relay/aot/aot_test.mk | 17 +- tests/python/relay/aot/infra.py | 31 ++- tests/python/relay/aot/test_crt_aot.py | 2 +- 26 files changed, 556 insertions(+), 493 deletions(-) delete mode 100644 include/tvm/runtime/crt/aot/tvm_backend.h delete mode 100644 include/tvm/runtime/crt/aot/tvm_error.h rename include/tvm/runtime/crt/{aot/tvm_executor.h => aot_executor.h} (93%) create mode 100644 include/tvm/runtime/crt/stack_memory.h create mode 100644 python/tvm/relay/backend/executor_factory.py delete mode 100644 python/tvm/relay/backend/graph_executor_factory.py delete mode 100644 src/runtime/crt/aot/tvm_executor.c create mode 100644 src/runtime/crt/common/aot_backend_api.c create mode 100644 src/runtime/crt/memory/stack_memory.c diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index ea9b393afdcc..620f7552cef6 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -40,16 +40,15 @@ if(USE_MICRO) "3rdparty/dmlc-core/include *.h -> include" "include/tvm/runtime c_*_api.h -> include/tvm/runtime" "include/tvm/runtime/crt *.h -> include/tvm/runtime/crt" - "include/tvm/runtime/crt/aot *.h -> src/runtime/crt/aot" "src/runtime/crt Makefile -> ." 
"src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" + "src/runtime/crt/aot_executor *.c -> src/runtime/crt/aot_executor" "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" "src/runtime/crt/host crt_config.h -> template/host" "src/runtime/crt/host *.cc -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" - "src/runtime/crt/aot *.c -> src/runtime/crt/aot" "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common" "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server" "src/runtime/minrpc *.h -> src/runtime/minrpc" @@ -99,7 +98,7 @@ if(USE_MICRO) set(make_quiet ) endif(${VERBOSE}) - list(APPEND crt_libraries memory graph_executor utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. + list(APPEND crt_libraries memory graph_executor aot_executor utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. foreach(crt_lib_name IN LISTS crt_libraries) list(APPEND crt_library_paths "host_standalone_crt/lib${crt_lib_name}.a") endforeach() @@ -137,7 +136,6 @@ if(USE_MICRO) file(GLOB TEST_SRCS ${CMAKE_SOURCE_DIR}/tests/crt/*_test.cc) find_path(GTEST_INCLUDE_DIR gtest/gtest.h) find_library(GTEST_LIB gtest "$ENV{GTEST_LIB}") - set(aot_executor_src "${standalone_crt_base}/src/runtime/crt/aot/tvm_executor.c") # Create the `crttest` target if we can find GTest. If not, we create dummy # targets that give the user an informative error message. @@ -147,9 +145,7 @@ if(USE_MICRO) string(REPLACE ".cc" "" __execname ${__srcname}) add_executable(${__execname} ${__srcpath}) list(APPEND TEST_EXECS ${__execname}) - target_sources(${__execname} PRIVATE ${aot_executor_src}) - target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/src/runtime/crt/host) - target_include_directories(${__execname} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/src/runtime/crt/aot) + target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_SOURCE_DIR}/src/runtime/crt/host) target_compile_options(${__execname} PRIVATE -pthread) target_link_libraries(${__execname} ${cmake_crt_libraries} ${GTEST_LIB} pthread) set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1) diff --git a/include/tvm/runtime/crt/aot/tvm_backend.h b/include/tvm/runtime/crt/aot/tvm_backend.h deleted file mode 100644 index 1875cea10a6b..000000000000 --- a/include/tvm/runtime/crt/aot/tvm_backend.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file include/tvm/runtime/crt/aot/tvm_backend.h - * \brief Backend functions for the AOT executor - * - * These are not designed to user-facing and may change without warning - */ - -#ifndef TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ -#define TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ - -#include -#include - -#include "tvm_error.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/*! Memory alignment for allocator */ -#ifndef TVM_RUNTIME_ALLOC_ALIGNMENT -#define TVM_RUNTIME_ALLOC_ALIGNMENT 16 -#endif - -/*! The AOT runtime links staticly */ -#define TVM_DLL - -/*! - * \brief Minimal TVMValue - */ -typedef union { - int64_t v_int64; /** Currently used for parameter lookup */ - void* v_handle; /** Pointer to other values */ -} TVMValue; - -/*! - * \brief Packed function signature definition - */ -typedef int32_t(tvm_function_t)(void* args, void* arg_type_ids, int32_t num_args, - void* out_ret_value, void* out_ret_tcode, void* resource_handle); - -/*! - * \brief Workspace memory structure - */ -typedef struct { - uint8_t* next_alloc; /** Pointer to the next block of bytes to allocate */ - uint8_t* workspace; /** Pointer to start of the workspace */ - size_t workspace_size; /** Total number of bytes in the workspace */ -} tvm_workspace_t; - -/** - * \brief Backend function to allocate temporal workspace. - * - * \note The result allocated space is ensured to be aligned to TVM_RUNTIME_ALLOC_ALIGNMENT. - * \note Currently matches CRT runtime signature but this will change in future to accommodate - * memory planning - * - * \param device_type Ignored - * \param device_id Ignored - * \param nbytes The size of the space requested. - * \param dtype_code_hint Ignored - * \param dtype_bits_hint Ignored - * \return void* NULL on error, a valid pointer on success - */ -void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, - int dtype_bits_hint); - -/*! - * \brief Backend function to free temporal workspace. - * - * \note Currently matches CRT runtime signature but this will change in future to accomodate memory - * planning - * - * \param ptr The result allocated space pointer. - * \param device_type Ignored - * \param device_id Ignored - * \return tvm_crt_error_t Containing any error statuses - */ -tvm_crt_error_t TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // TVM_RUNTIME_CRT_AOT_TVM_BACKEND_H_ diff --git a/include/tvm/runtime/crt/aot/tvm_error.h b/include/tvm/runtime/crt/aot/tvm_error.h deleted file mode 100644 index 4b90c1afd9fe..000000000000 --- a/include/tvm/runtime/crt/aot/tvm_error.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file include/tvm/runtime/crt/aot/tvm_error.h - * \brief Defines a subset of error codes returned by the CRT AOT executor. - */ - -#ifndef TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ -#define TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#define TVM_CRT_ERROR_CATEGORY_Pos 8 -#define TVM_CRT_ERROR_CATEGORY_Msk (0xff << TVM_CRT_ERROR_CATEGORY_Pos) -#define TVM_CRT_ERROR_CODE_Pos 0 -#define TVM_CRT_ERROR_CODE_Msk (0xff << TVM_CRT_ERROR_CODE_Pos) - -#define DEFINE_TVM_CRT_ERROR(category, code) \ - (((category) << TVM_CRT_ERROR_CATEGORY_Pos) | ((code) << TVM_CRT_ERROR_CODE_Pos)) -typedef enum { - kTvmErrorCategoryPlatform = 5, - kTvmErrorCategoryFunctionCall = 8, -} tvm_crt_error_category_t; - -typedef enum { - kTvmErrorNoError = 0, - - // Platform - kTvmErrorPlatformCheckFailure = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 0), - kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1), - kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), - kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3), - kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4), - - // Function Calls - common problems encountered calling functions. - kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), - kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1), - kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2), - - // System errors are always negative integers; this mask indicates presence of a system error. - // Cast tvm_crt_error_t to a signed integer to interpret the negative error code. - kTvmErrorSystemErrorMask = (1 << (sizeof(int) * 4 - 1)), -} tvm_crt_error_t; - -#ifdef __cplusplus -} -#endif - -#endif // TVM_RUNTIME_CRT_AOT_TVM_ERROR_H_ diff --git a/include/tvm/runtime/crt/aot/tvm_executor.h b/include/tvm/runtime/crt/aot_executor.h similarity index 93% rename from include/tvm/runtime/crt/aot/tvm_executor.h rename to include/tvm/runtime/crt/aot_executor.h index efa5e7b06750..b152e3aa7332 100644 --- a/include/tvm/runtime/crt/aot/tvm_executor.h +++ b/include/tvm/runtime/crt/aot_executor.h @@ -52,8 +52,7 @@ #include -#include "tvm_backend.h" -#include "tvm_error.h" +#include "error_codes.h" #ifdef __cplusplus extern "C" { @@ -68,6 +67,9 @@ extern "C" { typedef struct { } tvm_context_t; +typedef int32_t(tvm_function_t)(void* args, void* arg_type_ids, int32_t num_args, + void* out_ret_value, void* out_ret_tcode, void* resource_handle); + /*! * \brief TVM Model descriptor to describe the * model to the runtime. @@ -76,7 +78,6 @@ typedef struct { uint32_t num_input_tensors; /** Number of expected input tensors */ uint32_t num_output_tensors; /** Number of expected output tensors */ tvm_function_t* run_func; /** Generated model function, called through tvm_runtime_run */ - tvm_workspace_t* workspace; /** Memory workspace for the model to use */ } tvm_model_t; /*! diff --git a/include/tvm/runtime/crt/stack_memory.h b/include/tvm/runtime/crt/stack_memory.h new file mode 100644 index 000000000000..563311eced43 --- /dev/null +++ b/include/tvm/runtime/crt/stack_memory.h @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// LINT_C_FILE +#ifndef TVM_RUNTIME_CRT_STACK_MEMORY_H_ +#define TVM_RUNTIME_CRT_STACK_MEMORY_H_ +#include +#include + +#include "error_codes.h" + +/*! Memory alignment for allocator */ + +#ifndef TVM_RUNTIME_ALLOC_ALIGNMENT +#define TVM_RUNTIME_ALLOC_ALIGNMENT 16 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint8_t* next_alloc; /** Pointer to the next block of bytes to allocate */ + uint8_t* workspace; /** Pointer to start of the workspace */ + size_t workspace_size; /** Total number of bytes in the workspace */ +} tvm_workspace_t; + +void MemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, + size_t workspace_size); + +void* MemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes); + +tvm_crt_error_t MemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TVM_RUNTIME_CRT_STACK_MEMORY_H_ diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h index e7da2dd413a0..2b9d2c5f5a69 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -140,6 +140,12 @@ static constexpr const char* kTvmRuntimeCpp = "c++"; /*! \brief Value used with --runtime in target specs to indicate the C runtime. */ static constexpr const char* kTvmRuntimeCrt = "c"; +/*! \brief Value used with --executor in target specs to indicate the graph executor. */ +static constexpr const char* kTvmExecutorGraph = "graph"; + +/*! \brief Value used with --executor in target specs to indicate the aot executor. */ +static constexpr const char* kTvmExecutorAot = "aot"; + /*! * \brief Helper structure to register TargetKind * \sa TVM_REGISTER_TARGET_KIND diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 33c234eeede5..d8248d4e1a87 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -346,10 +346,6 @@ TVM_DLL const Op& tvm_stack_make_array(); */ TVM_DLL const Op& tvm_call_packed(); -// This achieve the same of a packed call, but with an extern call -// directly to the operator -TVM_DLL const Op& tvm_call_unpacked(); - /*! * \brief See pesudo code * diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index a3f0d4153aa6..64fcb9bf8790 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -24,7 +24,7 @@ import tarfile from ..contrib import utils -from ..relay.backend import graph_executor_factory +from ..relay.backend import executor_factory from ..relay import param_dict @@ -86,10 +86,6 @@ def _build_memory_map(graph_str): list : A list with one entry per storage id describing that memory. 
""" - memory_map = [] - if graph_str.startswith("primfn"): - return memory_map - graph = json.loads(graph_str) seen_storage_ids = set() @@ -120,7 +116,7 @@ def _build_memory_map(graph_str): return memory_map -def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactoryModule, file_name): +def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, file_name): """Export the build artifact in Model Library Format. This function creates a .tar archive containing the build artifacts in a standardized @@ -129,27 +125,21 @@ def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactory Parameters ---------- - mod : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule + mod : tvm.relay.backend.executor_factory.ExecutorFactoryModule The return value of tvm.relay.build, which will be exported into Model Library Format. file_name : str Path to the .tar archive to generate. """ tempdir = utils.tempdir() - is_aot = False - for v in mod.target.values(): - if v.attrs.get("executor", "graph_runtime") == "aot": - is_aot = True - break - - runtime = ["graph"] - if is_aot: - runtime = ["aot"] + is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule) + memory_map = [] if is_aot else _build_memory_map(mod.get_internal_repr()) + runtime = ["aot"] if is_aot else ["graph"] metadata = { "version": 1, "model_name": mod.libmod_name, "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"), - "memory": _build_memory_map(mod.graph), + "memory": memory_map, "target": {int(k): str(v) for k, v in mod.target.items()}, "runtimes": runtime, } @@ -170,11 +160,8 @@ def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactory with open(tempdir.relpath("relay.txt"), "w") as f: f.write(str(mod.ir_mod)) - if not is_aot: - graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) - os.makedirs(graph_config_dir_path) - with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: - f.write(mod.graph) + graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) + mod.save_config(graph_config_dir_path) with tarfile.open(file_name, "w") as tar_f: diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py new file mode 100644 index 000000000000..22712781355d --- /dev/null +++ b/python/tvm/relay/backend/executor_factory.py @@ -0,0 +1,216 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph executor factory.""" +import warnings +from ..._ffi.base import string_types +from ..._ffi.registry import get_global_func +from ...runtime import ndarray +from tvm import tir + + +class ExecutorFactoryModule: + """Graph executor factory module. 
+ This is a module of graph executor factory + + Parameters + ---------- + graph_str : str + Depending on executor: + * Graph executor: the graph to be deployed in json format output by graph compiler. + The graph can contain operator(tvm_op) that points to the name of + PackedFunc in the libmod. + * AOT executor: the string representation of the TIR executor PrimFunction + target : tvm.Target + The Target used to build this module. + libmod : tvm.Module + The module of the corresponding function + libmod_name: str + The name of module + params : dict of str to NDArray + The parameters of module + """ + + def get_internal_repr(self): + return self.internal_repr + + def get_params(self): + return None + + def get_lib(self): + return None + + def __getitem__(self, item): + return None + + def __iter__(self): + warnings.warn( + "legacy graph executor behavior of producing json / lib / params will be " + "removed in the next release." + " Please see documents of tvm.contrib.graph_executor.GraphModule for the " + " new recommended usage.", + DeprecationWarning, + 2, + ) + return self + + def save_config(self, config_path): + pass + + def __next__(self): + if self.iter_cnt > 2: + raise StopIteration + + objs = [self.internal_repr, self.lib, self.params] + obj = objs[self.iter_cnt] + self.iter_cnt += 1 + return obj + + +class AOTExecutorFactoryModule(ExecutorFactoryModule): + """Graph executor factory module. + This is a module of graph executor factory + + Parameters + ---------- + graph_str : str + Depending on executor: + * Graph executor: the graph to be deployed in json format output by graph compiler. + The graph can contain operator(tvm_op) that points to the name of + PackedFunc in the libmod. + * AOT executor: the string representation of the TIR executor PrimFunction + target : tvm.Target + The Target used to build this module. + libmod : tvm.Module + The module of the corresponding function + libmod_name: str + The name of module + params : dict of str to NDArray + The parameters of module + """ + + def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, params): + assert isinstance(runner_function, tir.PrimFunc) + args = [] + for k, v in params.items(): + args.append(k) + args.append(ndarray.array(v)) + + self.ir_mod = ir_mod + self.target = target + self.internal_repr = runner_function + self.lib = libmod + self.libmod_name = libmod_name + self.params = params + self.iter_cnt = 0 + + # Sometimes we want to get params explicitly. + # For example, we want to save its params value to + # an independent file. + def get_params(self): + return self.params + + def get_runner_function(self): + return self.internal_repr + + def get_lib(self): + return self.lib + + def __getitem__(self, item): + return self.module.__getitem__(item) + + def __iter__(self): + warnings.warn( + "legacy graph executor behavior of producing json / lib / params will be " + "removed in the next release." + " Please see documents of tvm.contrib.graph_executor.GraphModule for the " + " new recommended usage.", + DeprecationWarning, + 2, + ) + return self + + def __next__(self): + if self.iter_cnt > 2: + raise StopIteration + + objs = [self.graph, self.lib, self.params] + obj = objs[self.iter_cnt] + self.iter_cnt += 1 + return obj + + +class GraphExecutorFactoryModule(ExecutorFactoryModule): + """Graph executor factory module. 
+ This is a module of graph executor factory + + Parameters + ---------- + graph_str : str + Depending on executor: + * Graph executor: the graph to be deployed in json format output by graph compiler. + The graph can contain operator(tvm_op) that points to the name of + PackedFunc in the libmod. + * AOT executor: the string representation of the TIR executor PrimFunction + target : tvm.Target + The Target used to build this module. + libmod : tvm.Module + The module of the corresponding function + libmod_name: str + The name of module + params : dict of str to NDArray + The parameters of module + """ + + def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): + assert isinstance(graph_str, string_types) + fcreate = get_global_func("tvm.graph_executor_factory.create") + args = [] + for k, v in params.items(): + args.append(k) + args.append(ndarray.array(v)) + + self.ir_mod = ir_mod + self.target = target + self.module = fcreate(graph_str, libmod, libmod_name, *args) + self.internal_repr = graph_str + self.lib = libmod + self.libmod_name = libmod_name + self.params = params + self.iter_cnt = 0 + + def export_library(self, file_name, fcompile=None, addons=None, **kwargs): + return self.module.export_library(file_name, fcompile, addons, **kwargs) + + def save_config(self, config_path): + os.makedirs(config_path) + with open(os.path.join(config_path, "graph.json"), "w") as f: + f.write(mod.graph) + + # Sometimes we want to get params explicitly. + # For example, we want to save its params value to + # an independent file. + def get_params(self): + return self.params + + def get_graph(self): + return self.internal_repr + + def get_lib(self): + return self.lib + + def __getitem__(self, item): + return self.module.__getitem__(item) diff --git a/python/tvm/relay/backend/graph_executor_factory.py b/python/tvm/relay/backend/graph_executor_factory.py deleted file mode 100644 index bc543d90c8fb..000000000000 --- a/python/tvm/relay/backend/graph_executor_factory.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Graph executor factory.""" -import warnings -from ..._ffi.base import string_types -from ..._ffi.registry import get_global_func -from ...runtime import ndarray - - -class GraphExecutorFactoryModule: - """Graph executor factory module. - This is a module of graph executor factory - - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by graph compiler. - The graph can contain operator(tvm_op) that points to the name of - PackedFunc in the libmod. - target : tvm.Target - The Target used to build this module. 
- libmod : tvm.Module - The module of the corresponding function - libmod_name: str - The name of module - params : dict of str to NDArray - The parameters of module - """ - - def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): - assert isinstance(graph_json_str, string_types) - fcreate = get_global_func("tvm.graph_executor_factory.create") - args = [] - for k, v in params.items(): - args.append(k) - args.append(ndarray.array(v)) - - self.ir_mod = ir_mod - self.target = target - self.module = fcreate(graph_str, libmod, libmod_name, *args) - self.graph = graph_str - self.lib = libmod - self.libmod_name = libmod_name - self.params = params - self.iter_cnt = 0 - - def export_library(self, file_name, fcompile=None, addons=None, **kwargs): - return self.module.export_library(file_name, fcompile, addons, **kwargs) - - # Sometimes we want to get params explicitly. - # For example, we want to save its params value to - # an independent file. - def get_params(self): - return self.params - - def get_graph(self): - return self.graph - - def get_lib(self): - return self.lib - - def __getitem__(self, item): - return self.module.__getitem__(item) - - def __iter__(self): - warnings.warn( - "legacy graph executor behavior of producing json / lib / params will be " - "removed in the next release." - " Please see documents of tvm.contrib.graph_executor.GraphModule for the " - " new recommended usage.", - DeprecationWarning, - 2, - ) - return self - - def __next__(self): - if self.iter_cnt > 2: - raise StopIteration - - objs = [self.graph, self.lib, self.params] - obj = objs[self.iter_cnt] - self.iter_cnt += 1 - return obj diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 88a49fde0461..a9458073b746 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -34,7 +34,7 @@ from . import expr as _expr from . import function as _function from .transform import InferType -from .backend import graph_executor_factory as _graph_executor_factory +from .backend import executor_factory as _executor_factory from .backend import interpreter as _interpreter from .backend.vm import VMExecutor @@ -78,11 +78,13 @@ class BuildModule(object): def __init__(self): self.mod = _build_module._BuildModule() self._get_graph = self.mod["get_graph"] + self._get_runner_function = self.mod["get_runner_function"] self._get_module = self.mod["get_module"] self._build = self.mod["build"] self._optimize = self.mod["optimize"] self._set_params_func = self.mod["set_params"] self._get_params_func = self.mod["get_params"] + self._get_executor = self.mod["get_executor"] def build(self, mod, target=None, target_host=None, params=None): """ @@ -143,11 +145,13 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts - graph = self.get_graph() mod = self.get_module() params = self.get_params() + internal_repr = ( + self._get_runner_function() if self.get_executor() == "aot" else self.get_graph() + ) - return graph, mod, params + return internal_repr, mod, params def optimize(self, mod, target=None, params=None): """ @@ -203,6 +207,9 @@ def get_params(self): ret[key] = value.data return ret + def get_executor(self): + return self._get_executor() + @register_func("tvm.relay.module_export_library") def _module_export(module, file_name): # fcompile, addons, kwargs? 
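Editor's note: a minimal, illustrative sketch (not part of the patch) of how a caller would exercise the executor dispatch added in the hunk above and the one below. The `--executor=aot` target attribute, `get_executor`, and the `AOTExecutorFactoryModule` return type come from this patch series; the exact target string, the extra flags, and the toy Relay workload are assumptions and may need adjusting for a given checkout.

# Editor's sketch, assuming the executor dispatch added in build_module.py;
# the target string and the toy Relay function are illustrative only.
import tvm
from tvm import relay
from tvm.relay.backend import executor_factory

x = relay.var("x", shape=(1, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

# Select the AOT executor via the target attribute; --link-params and
# tir.disable_vectorize are assumed to be required for the "c" target here.
target = "c -runtime=c --link-params --executor=aot"
with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
    factory = relay.build(mod, target=target, mod_name="default")

# With --executor=aot the factory wraps the TIR runner PrimFunc rather than
# a graph JSON string.
assert isinstance(factory, executor_factory.AOTExecutorFactoryModule)
print(factory.get_internal_repr())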
@@ -287,11 +294,17 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() + runtime_repr, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) + + if bld_mod.get_executor() == "aot": + executor_factory = _executor_factory.AOTExecutorFactoryModule( + ir_mod, target, runtime_repr, runtime_mod, mod_name, params + ) + else: + executor_factory = _executor_factory.GraphExecutorFactoryModule( + ir_mod, target, runtime_repr, runtime_mod, mod_name, params + ) - graph, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) - executor_factory = _graph_executor_factory.GraphExecutorFactoryModule( - ir_mod, target, graph_json, runtime_mod, mod_name, params - ) return executor_factory diff --git a/src/relay/backend/aot_codegen.cc b/src/relay/backend/aot_codegen.cc index a6d79cd200f5..1e6edbd62edb 100644 --- a/src/relay/backend/aot_codegen.cc +++ b/src/relay/backend/aot_codegen.cc @@ -50,7 +50,7 @@ using TargetsMap = std::unordered_map; /*! \brief Lowered outputs */ struct AOTLoweredOutput { - std::string graph_tir; + tir::PrimFunc runner_func; Map lowered_funcs; Array external_mods; std::unordered_map> params; @@ -102,8 +102,6 @@ class AotReturnSidVisitor : public ExprVisitor { IntegerArray return_sid_; }; -using TIRNetwork = tvm::Array; - /*! \brief Code generator for graph runtime */ class AOTCodegen : public ExprVisitor { protected: @@ -179,7 +177,7 @@ class AOTCodegen : public ExprVisitor { // TODO(giuseros): Using call_extern to call into lookup_linked_param. This is because the // builtin::ret is not supported yet in the c target. Once return is supported we can use // tvm_call_packed_lowered(). - int param_sid = param_storage_ids_[reverse_params_lookup_[expr]]; + int param_sid = param_storage_ids_[params_by_expr_[expr]]; auto lookup_linked_param_fn = tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param); auto param_array = te::Var(make_string("param_", param_sid, "_array"), DataType::Handle()); @@ -224,7 +222,7 @@ class AOTCodegen : public ExprVisitor { // Input variable int main_index = std::distance(input_vars_.begin(), input_iter); return {main_signature_[main_index]}; - } else if (reverse_params_lookup_.find(arg) != reverse_params_lookup_.end()) { + } else if (params_by_expr_.find(arg) != params_by_expr_.end()) { // Parameter of the network return {pack_param(arg)}; } else { @@ -387,7 +385,7 @@ class AOTCodegen : public ExprVisitor { param_storage_ids_[name] = storage_device_map_[expr][0][0]->value; params_[name] = op->data; - reverse_params_lookup_.Set(expr, name); + params_by_expr_.Set(expr, name); // If the Constant node is an output node we need to copy the content of the parameter to the // output A Var node can only produce a single output @@ -409,7 +407,7 @@ class AOTCodegen : public ExprVisitor { void VisitExpr_(const LetNode* op) override { // TODO(giuseros): support Let nodes in AOT - throw std::invalid_argument("Let not yet implemented in AOT"); + CHECK(false) << "Let not yet implemented in AOT"; } void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); } void VisitExpr_(const OpNode* op) override { @@ -448,7 +446,7 @@ class AOTCodegen : public ExprVisitor { // Only allocate sids that are needed const bool is_input = (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end()); - const bool is_param = (reverse_params_lookup_.find(kv.first) != reverse_params_lookup_.end()); + const bool is_param = 
(params_by_expr_.find(kv.first) != params_by_expr_.end()); if (is_input || is_param) { continue; } @@ -478,14 +476,17 @@ class AOTCodegen : public ExprVisitor { } protected: - /*! \brief nodes */ /*! \brief mod */ runtime::Module* mod_; + /*! \brief list of input expressions (i.e., variable passed by the user) */ std::vector input_vars_; + /*! \brief input and output variables belonging to the main function signature */ Array main_signature_; /*! \brief target device */ TargetsMap targets_; + /*! \brief target host */ Target target_host_; + /*! PrimFunc attributes */ Map dict_attrs_; /*! @@ -494,8 +495,10 @@ class AOTCodegen : public ExprVisitor { * Maps param name to a pair of storage_id and NDArray. At runtime, the storage_id can be * used to lookup the parameter. */ - Map reverse_params_lookup_; std::unordered_map params_; + /*! \brief mapping between expression and parameters */ + Map params_by_expr_; + /*! \brief mapping between parameter names ("p0", "p1", etc..) and storage identifiers*/ std::unordered_map param_storage_ids_; /*! \brief plan memory of device result */ @@ -582,7 +585,7 @@ class AOTCodegen : public ExprVisitor { ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); } - ret.graph_tir = PrettyPrint(prim_func); + ret.runner_func = prim_func; ret.aot_metadata = runtime::AOTMetadata(input_vars_.size(), return_sid_.size()); return ret; } @@ -613,9 +616,9 @@ class AOTCodegenModule : public runtime::ModuleNode { Function func = args[0]; this->output_ = this->codegen_->Codegen(func); }); - } else if (name == "get_graph") { + } else if (name == "get_runner_function") { return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.graph_tir; }); + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.runner_func; }); } else if (name == "list_params_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { Array ret; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index cd6ccae031be..aed6fcabdc6f 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -43,7 +43,7 @@ namespace backend { using TargetsMap = Map; using namespace tvm::relay::transform; -enum class Executor { GraphRuntime, Aot }; +enum class Executor { Graph, Aot }; /*! 
* \brief Output of building module @@ -51,6 +51,7 @@ enum class Executor { GraphRuntime, Aot }; */ struct BuildOutput { std::string graph; + tir::PrimFunc runner_function; runtime::Module mod; std::unordered_map params; }; @@ -62,12 +63,13 @@ struct BuildOutput { struct GraphCodegen { public: explicit GraphCodegen(Target target_host) : target_host_(target_host) { - const String executor_str = target_host->GetAttr("executor").value_or("graph_runtime"); - if (executor_str == "graph_runtime") { - executor_ = Executor::GraphRuntime; + const String executor_str = + target_host->GetAttr("executor").value_or(kTvmExecutorGraph); + if (executor_str == kTvmExecutorGraph) { + executor_ = Executor::Graph; auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); mod = (*pf)(); - } else if (executor_str == "aot") { + } else if (executor_str == kTvmExecutorAot) { executor_ = Executor::Aot; auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); mod = (*pf)(); @@ -78,7 +80,7 @@ struct GraphCodegen { ~GraphCodegen() {} void Init(runtime::Module* m, TargetsMap targets) { - if (executor_ == Executor::GraphRuntime) { + if (executor_ == Executor::Graph) { CallFunc("init", m, targets); } else if (executor_ == Executor::Aot) { CallFunc("init", m, targets, target_host_); @@ -89,7 +91,21 @@ struct GraphCodegen { void Codegen(const Function& func) { CallFunc("codegen", func); } - std::string GetGraph() { return CallFunc("get_graph", nullptr); } + std::string GetGraph() { + if (executor_ == Executor::Graph) { + return CallFunc("get_graph", nullptr); + } else { + return ""; + } + } + + tir::PrimFunc GetRunnerFunction() { + if (executor_ == Executor::Aot) { + return CallFunc("get_runner_function"); + } else { + return tir::PrimFunc(); + } + } Array GetExternalModules() { return CallFunc>("get_external_modules", nullptr); @@ -121,7 +137,7 @@ struct GraphCodegen { return ret; } - runtime::AOTMetadata GetAOTMetdata() { + runtime::AOTMetadata GetAOTMetadata() { if (executor_ == Executor::Aot) { return CallFunc("get_aot_metadata"); } else { @@ -163,6 +179,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (name == "get_graph") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetGraph(); }); + } else if (name == "get_runner_function") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetRunnerFunction(); }); } else if (name == "get_module") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); }); @@ -197,6 +216,14 @@ class RelayBuildModule : public runtime::ModuleNode { ICHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1], this->params_); }); + } else if (name == "get_executor") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + auto target_host = GetTargetHost(); + const String executor_str = + target_host->GetAttr("executor").value_or(kTvmExecutorGraph); + + *rv = executor_str; + }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); @@ -210,6 +237,13 @@ class RelayBuildModule : public runtime::ModuleNode { */ const std::string& GetGraph() { return ret_.graph; } + /*! + * \brief Get the GraphJSON for runtime + * + * \return const std::string graph_json + */ + const tir::PrimFunc& GetRunnerFunction() { return ret_.runner_function; } + /*! 
* \brief Get the Module object * @@ -514,6 +548,7 @@ class RelayBuildModule : public runtime::ModuleNode { graph_codegen_->Codegen(func); ret_.graph = graph_codegen_->GetGraph(); + ret_.runner_function = graph_codegen_->GetRunnerFunction(); ret_.params = graph_codegen_->GetParams(); auto lowered_funcs = graph_codegen_->GetIRModule(); @@ -557,7 +592,7 @@ class RelayBuildModule : public runtime::ModuleNode { auto ext_mods = graph_codegen_->GetExternalModules(); ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, GetTargetHost(), - graph_codegen_->GetAOTMetdata()); + graph_codegen_->GetAOTMetadata()); } private: diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index bfd1cdc1f77c..351469d6e1ca 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -209,7 +209,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { for (const auto& kv : token_map_) { std::vector storage_ids; std::vector device_types; - std::vector sid_sizes; + std::vector sid_sizes_byte; for (StorageToken* tok : kv.second) { if (tok->device_type) { num_annotated_nodes++; @@ -217,9 +217,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor { num_nodes++; storage_ids.push_back(tok->storage_id); device_types.push_back(tok->device_type); - sid_sizes.push_back(GetMemorySize(tok)); + sid_sizes_byte.push_back(GetMemorySize(tok)); } - smap.Set(GetRef(kv.first), Array({storage_ids, device_types, sid_sizes})); + smap.Set(GetRef(kv.first), + Array({storage_ids, device_types, sid_sizes_byte})); } // Either all or none of the nodes should be annotated. if (num_annotated_nodes != 0 && num_annotated_nodes != num_nodes) { diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 8d3acab1858b..38c53d273a6e 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -68,6 +68,7 @@ endef LIBS = \ src/runtime/crt/common \ src/runtime/crt/graph_executor \ + src/runtime/crt/aot_executor \ src/runtime/crt/graph_executor_module \ src/runtime/crt/memory \ src/runtime/crt/utvm_rpc_common \ diff --git a/src/runtime/crt/aot/tvm_executor.c b/src/runtime/crt/aot/tvm_executor.c deleted file mode 100644 index 74069c6af26e..000000000000 --- a/src/runtime/crt/aot/tvm_executor.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// LINT_C_FILE - -/*! 
- * \file src/runtime/crt/aot/tvm_executor.c - * \brief Internal implementation of the AOT Executor - */ - -#include "tvm_executor.h" - -#include - -#include "tvm_backend.h" -#include "tvm_error.h" - -tvm_workspace_t* tvm_runtime_workspace; - -tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs, - tvm_context_t* context) { - static DLContext fake_ctx = {kDLCPU, 0}; - static int64_t fake_dims = 0; - static int64_t fake_shape = {0}; - - DLTensor tensors[model->num_input_tensors + model->num_output_tensors]; // NOLINT - TVMValue tvm_values[model->num_input_tensors + model->num_output_tensors]; // NOLINT - int32_t tvm_typeids[model->num_input_tensors + model->num_output_tensors]; // NOLINT - - for (int i = 0; i < model->num_input_tensors; i++) { - tensors[i] = (DLTensor){ - .ctx = fake_ctx, - .data = inputs[i], - .shape = &fake_shape, - .ndim = fake_dims, - .byte_offset = 0, - .strides = NULL, - }; - tvm_values[i].v_handle = &tensors[i]; - } - - for (int i = 0; i < model->num_output_tensors; i++) { - tensors[model->num_input_tensors + i] = (DLTensor){ - .ctx = fake_ctx, - .data = outputs[i], - .shape = &fake_shape, - .ndim = fake_dims, - .byte_offset = 0, - .strides = NULL, - }; - tvm_values[model->num_input_tensors + i].v_handle = &tensors[model->num_input_tensors + i]; - } - - return model->run_func(&tvm_values, &tvm_typeids, 0, NULL, 0, context); -} - -void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, - int dtype_bits_hint) { - uint32_t offset = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT - 1); - uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; - uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset; - uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; - - if (next_alloc > workspace_end) { - return NULL; - } - - tvm_runtime_workspace->next_alloc = next_alloc; - return current_alloc; -} - -tvm_crt_error_t TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - tvm_runtime_workspace->next_alloc = ptr; - return 0; -} diff --git a/src/runtime/crt/common/aot_backend_api.c b/src/runtime/crt/common/aot_backend_api.c new file mode 100644 index 000000000000..782ea89f0cb2 --- /dev/null +++ b/src/runtime/crt/common/aot_backend_api.c @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// LINT_C_FILE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "crt_config.h" + +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, + int dtype_bits_hint) { + tvm_crt_error_t err = kTvmErrorNoError; + void* ptr = 0; + DLDevice dev = {device_type, device_id}; + assert(nbytes > 0); + err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr); + CHECK_EQ(err, kTvmErrorNoError, + "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type, + device_id, nbytes, dtype_code_hint, dtype_bits_hint, err); + return ptr; +} + +int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + tvm_crt_error_t err = kTvmErrorNoError; + DLDevice dev = {device_type, device_id}; + err = TVMPlatformMemoryFree(ptr, dev); + return err; +} + +int TVMBackendParallelLaunch(FTVMParallelLambda flambda, void* cdata, int num_task) { + TVMParallelGroupEnv env; + env.num_task = 1; + flambda(0, &env, cdata); + return 0; +} diff --git a/src/runtime/crt/memory/stack_memory.c b/src/runtime/crt/memory/stack_memory.c new file mode 100644 index 000000000000..ac805b09d564 --- /dev/null +++ b/src/runtime/crt/memory/stack_memory.c @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +// LINT_C_FILE + +#include + +void* MemoryManager_Allocate(tvm_workspace_t *tvm_runtime_workspace, int32_t nbytes) { + uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT - 1); + uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; + uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; + uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; + + if (next_alloc > workspace_end) { + return NULL; + } + + tvm_runtime_workspace->next_alloc = next_alloc; + return current_alloc; +} + +tvm_crt_error_t MemoryManager_Free(tvm_workspace_t *tvm_runtime_workspace, void* ptr) { + tvm_runtime_workspace->next_alloc = ptr; + return 0; +} + +void MemoryManager_Init(tvm_workspace_t *tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size){ + tvm_runtime_workspace->next_alloc = g_aot_memory; + tvm_runtime_workspace->workspace = g_aot_memory; + tvm_runtime_workspace->workspace_size = workspace_size; +} diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index ac0f0b9f07dc..91ffa9546367 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -47,13 +47,8 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool is_aot_executor declared_globals_.clear(); decl_stream << "// tvm target: " << target_str << "\n"; decl_stream << "#define TVM_EXPORTS\n"; - if (is_aot_executor) { - decl_stream << "#include \"tvm_executor.h\"\n"; - decl_stream << "#include \"dlpack/dlpack.h\"\n"; - } else { - decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; - decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - } + decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; + decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; decl_stream << "#include \n"; decl_stream << "void* " << module_name_ << " = NULL;\n"; @@ -351,7 +346,8 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, const char* compare, } runtime::Module BuildCHost(IRModule mod, Target target) { - bool is_aot_executor = (target->GetAttr("executor").value_or("graph_runtime") == "aot"); + bool is_aot_executor = + (target->GetAttr("executor").value_or(kTvmExecutorGraph) == kTvmExecutorAot); using tvm::runtime::Registry; bool output_ssa = false; @@ -399,7 +395,8 @@ runtime::Module BuildCHost(IRModule mod, Target target) { if (is_aot_executor) { ICHECK(aot_executor_fn.defined()) - << "When using aot executor the executor function should be defined"; + << "When using aot executor the executor function " + << ::tvm::runtime::symbol::tvm_lookup_linked_param << " should be defined"; cg.AddFunction(aot_executor_fn); } diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index e91d78f580f2..32377df41f12 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -1,3 +1,4 @@ + /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -174,6 +175,7 @@ runtime::Module DeviceSourceModuleCreate( * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime. * \param modules The modules to be wrapped. * \param target the target the modules are compiled for. + * \param aot_metadata the metadata needed for aot code generation. * \return The wrapped module. 
*/ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 68de392e06f6..bf24692c3484 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -193,7 +193,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } void GenerateAOTDescriptor() { - code_ << "#include \n"; + code_ << "#include \"aot_executor.h\"\n"; + code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n"; code_ << "#ifdef __cplusplus\n"; code_ << "extern \"C\"\n"; code_ << "#endif\n"; diff --git a/tests/crt/aot_executor_test.cc b/tests/crt/aot_executor_test.cc index 753d9d9dc4de..5f5cfdb5d6c7 100644 --- a/tests/crt/aot_executor_test.cc +++ b/tests/crt/aot_executor_test.cc @@ -19,8 +19,8 @@ #include #include - -#include "tvm_executor.h" +#include +#include int32_t test_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) { diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index a5df9a5b6477..15780e0b20f5 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -18,34 +18,27 @@ */ #include - -#include "tvm_backend.h" - -// TODO(Mousius) - Move memory allocation to individual networks -extern tvm_workspace_t* tvm_runtime_workspace; +#include /* * Tests allocations are properly aligned when allocated */ TEST(AOTMemory, Allocate) { static uint8_t model_memory[80]; - tvm_workspace_t workspace = { - .next_alloc = model_memory, - .workspace = model_memory, - .workspace_size = 80, - }; - tvm_runtime_workspace = &workspace; - - void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + tvm_workspace_t tvm_runtime_workspace; + + MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + + void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = TVMBackendAllocWorkspace(0, 0, 2, 0, 0); + void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 2); ASSERT_EQ(block_two, &model_memory[16]); - void* two_blocks = TVMBackendAllocWorkspace(0, 0, 24, 0, 0); + void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 24); ASSERT_EQ(two_blocks, &model_memory[32]); - void* block_three = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + void* block_three = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_three, &model_memory[64]); } @@ -54,25 +47,21 @@ TEST(AOTMemory, Allocate) { */ TEST(AOTMemory, Free) { static uint8_t model_memory[80]; - tvm_workspace_t workspace = { - .next_alloc = model_memory, - .workspace = model_memory, - .workspace_size = 80, - }; - tvm_runtime_workspace = &workspace; - - void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + tvm_workspace_t tvm_runtime_workspace; + MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + + void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_two, &model_memory[16]); - ASSERT_EQ(0, TVMBackendFreeWorkspace(0, 0, block_two)); + ASSERT_EQ(0, MemoryManager_Free(&tvm_runtime_workspace, block_two)); - void* two_blocks = TVMBackendAllocWorkspace(0, 0, 2, 0, 0); + void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 2); ASSERT_EQ(two_blocks, &model_memory[16]); - ASSERT_EQ(0, 
TVMBackendFreeWorkspace(0, 0, two_blocks)); + ASSERT_EQ(0, MemoryManager_Free(&tvm_runtime_workspace, two_blocks)); - void* block_three = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + void* block_three = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_three, &model_memory[16]); } @@ -81,20 +70,16 @@ TEST(AOTMemory, Free) { */ TEST(AOTMemory, OverAllocate) { static uint8_t model_memory[72]; - tvm_workspace_t workspace = { - .next_alloc = model_memory, - .workspace = model_memory, - .workspace_size = 72, - }; - tvm_runtime_workspace = &workspace; - - void* block_one = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + tvm_workspace_t tvm_runtime_workspace; + MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + + void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = TVMBackendAllocWorkspace(0, 0, 1, 0, 0); + void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_two, &model_memory[16]); - void* two_blocks = TVMBackendAllocWorkspace(0, 0, 64, 0, 0); + void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 64); ASSERT_EQ(two_blocks, (void*)NULL); } diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index 66dd6e6ae21f..91030fac787a 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -29,8 +29,9 @@ CC_OPTS = CC=$(CC) AR=$(AR) RANLIB=$(RANLIB) PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ - -I$(TVM_ROOT)/include/tvm/runtime/crt/aot \ - -I$(TVM_ROOT)/src/runtime/crt/include \ + -I$(TVM_ROOT)/include/tvm/runtime/crt \ + -I$(TVM_ROOT)/src/runtime/crt/host \ + -I$(TVM_ROOT)/include \ -I$(DMLC_CORE)/include \ -I$(TVM_ROOT)/3rdparty/dlpack/include \ -I$(AOT_ROOT)\ @@ -46,7 +47,7 @@ CRT_SRCS = $(shell find $(CRT_ROOT)) aot_test_runner: $(build_dir)/aot_test_runner -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o $(build_dir)/stack_memory.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm @@ -58,7 +59,15 @@ $(build_dir)/lib0.o: $(build_dir)/../codegen/host/src/lib0.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) -$(build_dir)/tvm_executor.o: $(TVM_ROOT)/src/runtime/crt/aot/tvm_executor.c +$(build_dir)/tvm_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_executor.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + +$(build_dir)/stack_memory.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_memory.c + $(QUIET)mkdir -p $(@D) + $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + +$(build_dir)/crt_backend_api.o: $(TVM_ROOT)/src/runtime/crt/common/aot_backend_api.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) diff --git a/tests/python/relay/aot/infra.py b/tests/python/relay/aot/infra.py index 475b150ccd65..0ccd474ed046 100644 --- a/tests/python/relay/aot/infra.py +++ b/tests/python/relay/aot/infra.py @@ -39,7 +39,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.op.contrib import get_pattern_table -from tvm.contrib import utils +from tvm.contrib import utils, graph_executor from tvm.relay.backend import compile_engine from tvm.contrib import utils from tvm.contrib import graph_runtime 
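Editor's note: an illustrative sketch (not part of the patch) of the reference-output pattern that generate_ref_data in this test infrastructure follows after the graph_runtime to graph_executor rename above: build the same model with the default graph executor on the host and keep its outputs as the golden values the AOT runner is compared against. The toy workload and shapes below are assumptions, not taken from the patch.

# Editor's sketch of a graph-executor reference run; workload and shapes are
# illustrative only.
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0, "float32")))

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm")

dev = tvm.cpu(0)
runner = graph_executor.GraphModule(lib["default"](dev))
runner.set_input("x", np.zeros((1, 4), dtype="float32"))
runner.run()
# Collect every output as the reference data for the AOT comparison.
ref_outputs = [runner.get_output(i).asnumpy() for i in range(runner.get_num_outputs())]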
@@ -72,7 +72,8 @@ def create_main(test_name, input_list, output_list, output_path): raw_path = file_path.with_suffix(".c").resolve() with open(raw_path, "w") as main_file: main_file.write("#include \n") - main_file.write("#include \n") + main_file.write('#include "aot_executor.h"\n') + main_file.write('#include "stack_memory.h"\n') main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") @@ -83,7 +84,23 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write('#include "output_data%i.h"\n' % i) main_file.write("extern tvm_model_t network;\n") - main_file.write("extern tvm_workspace_t *tvm_runtime_workspace;\n") + main_file.write("tvm_workspace_t app_workspace;\n") + main_file.write( + """ +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr){ + (*out_ptr) = MemoryManager_Allocate(&app_workspace, num_bytes); +} + +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev){ + MemoryManager_Free(&app_workspace,ptr); +} + +void TVMPlatformAbort(tvm_crt_error_t code) { } + +void TVMLogf(const char* msg, ...) { } + + """ + ) main_file.write("int main(){\n") main_file.write("void* inputs[%i] = { " % (len(input_list))) @@ -96,11 +113,7 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("output_data%i, " % i) main_file.write("};\n") - main_file.write("") - main_file.write( - "tvm_workspace_t app_workspace = {.next_alloc=g_aot_memory, .workspace=g_aot_memory, .workspace_size=WORKSPACE_SIZE};\n" - ) - main_file.write("tvm_runtime_workspace = &app_workspace;\n") + main_file.write("MemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);") main_file.write("tvm_runtime_run(&network, inputs, outputs, NULL);") for i in range(0, len(output_list)): @@ -205,7 +218,7 @@ def generate_ref_data(mod, input_data, params=None, target="llvm"): lib_path = temp.relpath(lib_name) lib.export_library(lib_path) lib = tvm.runtime.load_module(lib_path) - grt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + grt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu())) grt_mod.set_input(**input_data) grt_mod.run() output_count = grt_mod.get_num_outputs() diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index b6480e039c61..2638737adbee 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -34,7 +34,7 @@ from tvm.contrib import utils from tvm.relay.backend import compile_engine from tvm.contrib import utils -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.micro import export_model_library_format from tvm.relay import testing From 9370ed45374b2b3391c89cfbdec087ef91737c54 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 12 Apr 2021 22:03:57 +0100 Subject: [PATCH 04/33] Fixing comments + refactoring - 2 Change-Id: I7200cc17b297e42bf67dcdef6f643e86991ca0a8 --- apps/bundle_deploy/bundle.c | 2 +- apps/bundle_deploy/bundle_static.c | 2 +- include/tvm/runtime/crt/aot_executor.h | 34 ++--- .../crt/{memory.h => page_allocator.h} | 8 +- .../crt/{stack_memory.h => stack_allocator.h} | 24 ++-- python/tvm/micro/model_library_format.py | 11 +- python/tvm/relay/backend/executor_factory.py | 133 +++++------------- .../relay/backend/graph_executor_codegen.py | 2 +- python/tvm/relay/build_module.py | 31 ++-- src/relay/backend/aot_codegen.cc | 96 ++++++++----- src/relay/backend/build_module.cc | 40 +++--- 
src/relay/backend/graph_executor_codegen.cc | 2 +- src/runtime/crt/aot_executor/aot_executor.c | 65 +++++++++ src/runtime/crt/common/crt_backend_api.c | 2 +- src/runtime/crt/common/crt_runtime_api.c | 2 - src/runtime/crt/common/ndarray.c | 2 +- .../crt/graph_executor/graph_executor.c | 2 +- src/runtime/crt/graph_executor/load_json.c | 2 +- src/runtime/crt/host/main.cc | 2 +- .../memory/{memory.h => page_allocator.h} | 10 +- .../crt/memory/{memory.c => page_allocator.c} | 3 +- .../{stack_memory.c => stack_allocator.c} | 17 +-- src/runtime/crt/utvm_rpc_server/rpc_server.cc | 2 +- tests/cpp/relay_build_module_test.cc | 2 +- tests/cpp/utvm_runtime_standalone_test.cc | 2 +- tests/crt/aot_executor_test.cc | 61 +++----- tests/crt/aot_memory_test.cc | 34 ++--- tests/crt/framing_test.cc | 2 +- tests/crt/memory_test.cc | 4 +- tests/crt/session_test.cc | 2 +- tests/python/relay/aot/aot_test.mk | 4 +- tests/python/relay/aot/infra.py | 14 +- tests/python/relay/aot/test_crt_aot.py | 15 +- tests/python/unittest/test_crt.py | 2 +- .../test_runtime_module_based_interface.py | 2 +- 35 files changed, 315 insertions(+), 323 deletions(-) rename include/tvm/runtime/crt/{memory.h => page_allocator.h} (94%) rename include/tvm/runtime/crt/{stack_memory.h => stack_allocator.h} (57%) create mode 100644 src/runtime/crt/aot_executor/aot_executor.c rename src/runtime/crt/include/tvm/runtime/crt/internal/memory/{memory.h => page_allocator.h} (94%) rename src/runtime/crt/memory/{memory.c => page_allocator.c} (99%) rename src/runtime/crt/memory/{stack_memory.c => stack_allocator.c} (70%) diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 9083f7b5f48b..4dbe1141c6d4 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -23,8 +23,8 @@ #include #include #include -#include #include +#include #ifdef ENABLE_TVM_ABORT_BACKTRACE #include "backtrace.h" diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index 62e63d6b4fe2..6e3867e4f4a2 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/tvm/runtime/crt/aot_executor.h b/include/tvm/runtime/crt/aot_executor.h index b152e3aa7332..6d3e80e2ccaf 100644 --- a/include/tvm/runtime/crt/aot_executor.h +++ b/include/tvm/runtime/crt/aot_executor.h @@ -47,52 +47,38 @@ * } */ -#ifndef TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ -#define TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ +#ifndef TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ #include - -#include "error_codes.h" +#include +#include #ifdef __cplusplus extern "C" { #endif -/*! - * \brief Context information for future integrations - * which is passed through to the operators. - * - * \note Can be used for drivers and platform specific information. - */ -typedef struct { -} tvm_context_t; - -typedef int32_t(tvm_function_t)(void* args, void* arg_type_ids, int32_t num_args, - void* out_ret_value, void* out_ret_tcode, void* resource_handle); - /*! * \brief TVM Model descriptor to describe the * model to the runtime. 
*/ typedef struct { - uint32_t num_input_tensors; /** Number of expected input tensors */ - uint32_t num_output_tensors; /** Number of expected output tensors */ - tvm_function_t* run_func; /** Generated model function, called through tvm_runtime_run */ + uint32_t num_input_tensors; /** Number of expected input tensors */ + uint32_t num_output_tensors; /** Number of expected output tensors */ + TVMBackendPackedCFunc run_func; /** Generated model function, called through tvm_runtime_run */ } tvm_model_t; /*! - * \brief Main entry point for + * \brief Main entry point to execute the AOT runner function * \param model Model descriptor structure to reference for runtime information * \param inputs Pointer to input pointer(s) * \param outputs Pointer to output pointer(s) - * \param context Context information to be passed through to operators * \return tvm_status_t containing success or errors from the model run */ -tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs, - tvm_context_t* context); +tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_AOT_TVM_EXECUTOR_H_ +#endif // TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ diff --git a/include/tvm/runtime/crt/memory.h b/include/tvm/runtime/crt/page_allocator.h similarity index 94% rename from include/tvm/runtime/crt/memory.h rename to include/tvm/runtime/crt/page_allocator.h index c830116528e0..a379c6b8ded5 100644 --- a/include/tvm/runtime/crt/memory.h +++ b/include/tvm/runtime/crt/page_allocator.h @@ -18,12 +18,12 @@ */ /*! - * \file tvm/runtime/crt/memory.h + * \file tvm/runtime/crt/page_allocator.h * \brief An implementation of a dynamic memory allocator for microcontrollers. */ -#ifndef TVM_RUNTIME_CRT_MEMORY_H_ -#define TVM_RUNTIME_CRT_MEMORY_H_ +#ifndef TVM_RUNTIME_CRT_PAGE_ALLOCATOR_H_ +#define TVM_RUNTIME_CRT_PAGE_ALLOCATOR_H_ #ifdef __cplusplus extern "C" { @@ -79,4 +79,4 @@ tvm_crt_error_t MemoryManagerCreate(MemoryManagerInterface** manager, uint8_t* m } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_MEMORY_H_ +#endif // TVM_RUNTIME_CRT_PAGE_ALLOCATOR_H_ diff --git a/include/tvm/runtime/crt/stack_memory.h b/include/tvm/runtime/crt/stack_allocator.h similarity index 57% rename from include/tvm/runtime/crt/stack_memory.h rename to include/tvm/runtime/crt/stack_allocator.h index 563311eced43..43db589831d9 100644 --- a/include/tvm/runtime/crt/stack_memory.h +++ b/include/tvm/runtime/crt/stack_allocator.h @@ -18,8 +18,8 @@ */ // LINT_C_FILE -#ifndef TVM_RUNTIME_CRT_STACK_MEMORY_H_ -#define TVM_RUNTIME_CRT_STACK_MEMORY_H_ +#ifndef TVM_RUNTIME_CRT_STACK_ALLOCATOR_H_ +#define TVM_RUNTIME_CRT_STACK_ALLOCATOR_H_ #include #include @@ -27,8 +27,8 @@ /*! 
Memory alignment for allocator */ -#ifndef TVM_RUNTIME_ALLOC_ALIGNMENT -#define TVM_RUNTIME_ALLOC_ALIGNMENT 16 +#ifndef TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES +#define TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES 16 #endif #ifdef __cplusplus @@ -36,20 +36,20 @@ extern "C" { #endif typedef struct { - uint8_t* next_alloc; /** Pointer to the next block of bytes to allocate */ - uint8_t* workspace; /** Pointer to start of the workspace */ - size_t workspace_size; /** Total number of bytes in the workspace */ + uint8_t* next_alloc; // Pointer to the next block of TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES + uint8_t* workspace; // Pointer to start of the workspace + size_t workspace_size; // Total number of bytes in the workspace } tvm_workspace_t; -void MemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, - size_t workspace_size); +void StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, + size_t workspace_size); -void* MemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes); +void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes); -tvm_crt_error_t MemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr); +tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_STACK_MEMORY_H_ +#endif // TVM_RUNTIME_CRT_STACK_ALLOCATOR_H_ diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 64fcb9bf8790..833740ab7fc2 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -73,7 +73,7 @@ def _populate_codegen_dir(mod, codegen_dir: str): dso_mod.save(file_name) -def _build_memory_map(graph_str): +def _build_memory_map(graph_json): """Build a simpler memory map from graph JSON. Parameters @@ -86,7 +86,7 @@ def _build_memory_map(graph_str): list : A list with one entry per storage id describing that memory. """ - graph = json.loads(graph_str) + graph = json.loads(graph_json) seen_storage_ids = set() for node_id, storage_id in enumerate(graph["attrs"]["storage_id"][1]): @@ -160,8 +160,11 @@ def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, fil with open(tempdir.relpath("relay.txt"), "w") as f: f.write(str(mod.ir_mod)) - graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) - mod.save_config(graph_config_dir_path) + if not is_aot: + graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) + os.makedirs(graph_config_dir_path) + with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: + f.write(mod.save_executor_config()) with tarfile.open(file_name, "w") as tar_f: diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index 22712781355d..2b1e1e597387 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -14,84 +14,51 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Graph executor factory.""" +"""Executor factory modules.""" import warnings +from abc import abstractmethod + from ..._ffi.base import string_types from ..._ffi.registry import get_global_func from ...runtime import ndarray + from tvm import tir class ExecutorFactoryModule: - """Graph executor factory module. 
- This is a module of graph executor factory - - Parameters - ---------- - graph_str : str - Depending on executor: - * Graph executor: the graph to be deployed in json format output by graph compiler. - The graph can contain operator(tvm_op) that points to the name of - PackedFunc in the libmod. - * AOT executor: the string representation of the TIR executor PrimFunction - target : tvm.Target - The Target used to build this module. - libmod : tvm.Module - The module of the corresponding function - libmod_name: str - The name of module - params : dict of str to NDArray - The parameters of module + """Common interface for executor factory modules + This class describes the common API of different + factory modules """ + @abstractmethod def get_internal_repr(self): - return self.internal_repr + """Common function to return the internal representation + the executor relies upon to execute the network + """ + raise NotImplementedError + @abstractmethod def get_params(self): - return None - + """ + Sometimes we want to get params explicitly. + For example, we want to save its params value to + an independent file. + """ + raise NotImplementedError + + @abstractmethod def get_lib(self): - return None - - def __getitem__(self, item): - return None - - def __iter__(self): - warnings.warn( - "legacy graph executor behavior of producing json / lib / params will be " - "removed in the next release." - " Please see documents of tvm.contrib.graph_executor.GraphModule for the " - " new recommended usage.", - DeprecationWarning, - 2, - ) - return self - - def save_config(self, config_path): - pass - - def __next__(self): - if self.iter_cnt > 2: - raise StopIteration - - objs = [self.internal_repr, self.lib, self.params] - obj = objs[self.iter_cnt] - self.iter_cnt += 1 - return obj + """ Return the generated library""" + raise NotImplementedError class AOTExecutorFactoryModule(ExecutorFactoryModule): - """Graph executor factory module. - This is a module of graph executor factory + """AOT executor factory module. Parameters ---------- - graph_str : str - Depending on executor: - * Graph executor: the graph to be deployed in json format output by graph compiler. - The graph can contain operator(tvm_op) that points to the name of - PackedFunc in the libmod. - * AOT executor: the string representation of the TIR executor PrimFunction + runner_function : the PrimFunc containing of the TIR main executor function. target : tvm.Target The Target used to build this module. libmod : tvm.Module @@ -111,7 +78,7 @@ def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, params) self.ir_mod = ir_mod self.target = target - self.internal_repr = runner_function + self.runner_func = runner_function self.lib = libmod self.libmod_name = libmod_name self.params = params @@ -123,35 +90,12 @@ def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, params) def get_params(self): return self.params - def get_runner_function(self): - return self.internal_repr + def get_internal_repr(self): + return self.runner_func def get_lib(self): return self.lib - def __getitem__(self, item): - return self.module.__getitem__(item) - - def __iter__(self): - warnings.warn( - "legacy graph executor behavior of producing json / lib / params will be " - "removed in the next release." 
- " Please see documents of tvm.contrib.graph_executor.GraphModule for the " - " new recommended usage.", - DeprecationWarning, - 2, - ) - return self - - def __next__(self): - if self.iter_cnt > 2: - raise StopIteration - - objs = [self.graph, self.lib, self.params] - obj = objs[self.iter_cnt] - self.iter_cnt += 1 - return obj - class GraphExecutorFactoryModule(ExecutorFactoryModule): """Graph executor factory module. @@ -159,12 +103,9 @@ class GraphExecutorFactoryModule(ExecutorFactoryModule): Parameters ---------- - graph_str : str - Depending on executor: - * Graph executor: the graph to be deployed in json format output by graph compiler. + graph_str : the json graph to be deployed in json format output by graph compiler. The graph can contain operator(tvm_op) that points to the name of PackedFunc in the libmod. - * AOT executor: the string representation of the TIR executor PrimFunction target : tvm.Target The Target used to build this module. libmod : tvm.Module @@ -186,7 +127,7 @@ def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): self.ir_mod = ir_mod self.target = target self.module = fcreate(graph_str, libmod, libmod_name, *args) - self.internal_repr = graph_str + self.graph = graph_str self.lib = libmod self.libmod_name = libmod_name self.params = params @@ -195,20 +136,18 @@ def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return self.module.export_library(file_name, fcompile, addons, **kwargs) - def save_config(self, config_path): - os.makedirs(config_path) - with open(os.path.join(config_path, "graph.json"), "w") as f: - f.write(mod.graph) + def save_executor_config(self): + return self.graph - # Sometimes we want to get params explicitly. - # For example, we want to save its params value to - # an independent file. 
def get_params(self): return self.params - def get_graph(self): + def get_graph_json(self): return self.internal_repr + def get_internal_repr(self): + return self.graph + def get_lib(self): return self.lib diff --git a/python/tvm/relay/backend/graph_executor_codegen.py b/python/tvm/relay/backend/graph_executor_codegen.py index 6dcc5655aa9a..f24bf2c2b55b 100644 --- a/python/tvm/relay/backend/graph_executor_codegen.py +++ b/python/tvm/relay/backend/graph_executor_codegen.py @@ -46,7 +46,7 @@ def __init__(self, mod, target): self._mod = _build_module._GraphExecutorCodegen() self._init = self._mod["init"] self._codegen = self._mod["codegen"] - self._get_graph_json = self._mod["get_graph"] + self._get_graph_json = self._mod["get_graph_json"] self._list_params_name = self._mod["list_params_name"] self._get_param_by_name = self._mod["get_param_by_name"] self._get_irmodule = self._mod["get_irmodule"] diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index a9458073b746..826f50858770 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -77,14 +77,14 @@ class BuildModule(object): def __init__(self): self.mod = _build_module._BuildModule() - self._get_graph = self.mod["get_graph"] + self._get_graph_json = self.mod["get_graph_json"] self._get_runner_function = self.mod["get_runner_function"] self._get_module = self.mod["get_module"] self._build = self.mod["build"] self._optimize = self.mod["optimize"] self._set_params_func = self.mod["set_params"] self._get_params_func = self.mod["get_params"] - self._get_executor = self.mod["get_executor"] + self._get_executor_type = self.mod["get_executor_type"] def build(self, mod, target=None, target_host=None, params=None): """ @@ -148,7 +148,9 @@ def build(self, mod, target=None, target_host=None, params=None): mod = self.get_module() params = self.get_params() internal_repr = ( - self._get_runner_function() if self.get_executor() == "aot" else self.get_graph() + self._get_runner_function() + if self.get_executor_type() == "aot" + else self.get_graph_json() ) return internal_repr, mod, params @@ -191,9 +193,9 @@ def optimize(self, mod, target=None, params=None): def _set_params(self, params): self._set_params_func(_convert_param_map(params)) - def get_graph(self): + def get_graph_json(self): """Return the json file of the built program.""" - return self._get_graph() + return self._get_graph_json() def get_module(self): """Return the built module.""" @@ -207,8 +209,9 @@ def get_params(self): ret[key] = value.data return ret - def get_executor(self): - return self._get_executor() + def get_executor_type(self): + """ Return the executor TVM is building for """ + return self._get_executor_type() @register_func("tvm.relay.module_export_library") @@ -258,7 +261,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- - factory_module : tvm.relay.backend.graph_executor_factory.ExecutorFactoryModule + factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. 
""" # pylint: enable=line-too-long @@ -294,16 +297,18 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() - runtime_repr, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) + internal_repr, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) - if bld_mod.get_executor() == "aot": + if bld_mod.get_executor_type() == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( - ir_mod, target, runtime_repr, runtime_mod, mod_name, params + ir_mod, target, internal_repr, runtime_mod, mod_name, params ) - else: + elif bld_mod.get_executor_type() == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( - ir_mod, target, runtime_repr, runtime_mod, mod_name, params + ir_mod, target, internal_repr, runtime_mod, mod_name, params ) + else: + assert False, "Executor not supported" return executor_factory diff --git a/src/relay/backend/aot_codegen.cc b/src/relay/backend/aot_codegen.cc index 1e6edbd62edb..5a509a8219c1 100644 --- a/src/relay/backend/aot_codegen.cc +++ b/src/relay/backend/aot_codegen.cc @@ -102,7 +102,7 @@ class AotReturnSidVisitor : public ExprVisitor { IntegerArray return_sid_; }; -/*! \brief Code generator for graph runtime */ +/*! \brief Code generator for AOT executor */ class AOTCodegen : public ExprVisitor { protected: /*! @@ -367,8 +367,9 @@ class AOTCodegen : public ExprVisitor { Expr expr = GetRef(op); // If the Var node is an output node we need to copy the content of the variable to the output - // A Var node can only produce a single output + // It's safe to check the SID here because Var StorageToken are never reallocated Array sids = storage_device_map_[expr]; + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), static_cast((sids[0][0].as())->value)); if (output_iter != return_sid_.end()) { @@ -459,6 +460,8 @@ class AOTCodegen : public ExprVisitor { continue; } + // TODO(giuseros): we should allocate this one time outside the PrimFunc + // so we dont' pay the price of allocation for every inference if (!allocated[sid]) { body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), body); } @@ -602,57 +605,39 @@ class AOTCodegenModule : public runtime::ModuleNode { void* mod = args[0]; Map tmp = args[1]; tvm::Target target_host = args[2]; - TargetsMap targets; - for (const auto& it : tmp) { - auto dev_type = it.first.as(); - ICHECK(dev_type); - targets[dev_type->value] = it.second; - } - codegen_ = std::make_shared(reinterpret_cast(mod), targets, - target_host); + init(mod, tmp, target_host); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { Function func = args[0]; - this->output_ = this->codegen_->Codegen(func); + this->output_ = codegen(func); }); } else if (name == "get_runner_function") { - return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.runner_func; }); - } else if (name == "list_params_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - Array ret; - for (const auto& kv : this->output_.params) { - ret.push_back(kv.first); - } - *rv = ret; - }); + *rv = get_runner_function(); + }); // c; }); + } else if (name == "list_params_name") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = list_params_name(); }); } else if (name == "get_param_by_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String 
key = args[0]; - auto it = this->output_.params.find(key); - CHECK(it != this->output_.params.end()) << "no such parameter " << key; - *rv = (*it).second.second; + *rv = get_param_by_name(key); }); } else if (name == "get_param_id") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; - auto it = this->output_.params.find(key); - CHECK(it != this->output_.params.end()) << "no such parameter " << key; - *rv = (*it).second.first; + *rv = get_param_id(key); }); } else if (name == "get_irmodule") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->output_.lowered_funcs; - }); + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_irmodule(); }); } else if (name == "get_external_modules") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->output_.external_mods; - }); + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_external_modules(); }); } else if (name == "get_aot_metadata") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->output_.aot_metadata; - }); + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_aot_metadata(); }); } else { return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); } @@ -661,6 +646,47 @@ class AOTCodegenModule : public runtime::ModuleNode { const char* type_key() const final { return "RelayGraphRuntimeCodegenModule"; } private: + void init(void* mod, Map tmp, Target target_host) { + TargetsMap targets; + for (const auto& it : tmp) { + auto dev_type = it.first.as(); + ICHECK(dev_type); + targets[dev_type->value] = it.second; + } + codegen_ = + std::make_shared(reinterpret_cast(mod), targets, target_host); + } + + AOTLoweredOutput codegen(Function func) { return this->codegen_->Codegen(func); } + + tir::PrimFunc get_runner_function() { return this->output_.runner_func; } + + Array list_params_name() { + Array ret; + for (const auto& kv : this->output_.params) { + ret.push_back(kv.first); + } + return ret; + } + + runtime::NDArray get_param_by_name(String key) { + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + return (*it).second.second; + } + + Array get_external_modules() { return output_.external_mods; } + + int get_param_id(String key) { + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + return (*it).second.first; + } + + Map get_irmodule() { return this->output_.lowered_funcs; } + + runtime::AOTMetadata get_aot_metadata() { return output_.aot_metadata; } + std::shared_ptr codegen_; AOTLoweredOutput output_; }; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index aed6fcabdc6f..2f192c0b0c17 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -43,14 +43,16 @@ namespace backend { using TargetsMap = Map; using namespace tvm::relay::transform; -enum class Executor { Graph, Aot }; +/*! + * Type of supported executors + */ +enum class ExecutorType { Graph, Aot }; /*! 
* \brief Output of building module - * */ struct BuildOutput { - std::string graph; + std::string graph_json; tir::PrimFunc runner_function; runtime::Module mod; std::unordered_map params; @@ -66,23 +68,23 @@ struct GraphCodegen { const String executor_str = target_host->GetAttr("executor").value_or(kTvmExecutorGraph); if (executor_str == kTvmExecutorGraph) { - executor_ = Executor::Graph; + executor_ = ExecutorType::Graph; auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); mod = (*pf)(); } else if (executor_str == kTvmExecutorAot) { - executor_ = Executor::Aot; + executor_ = ExecutorType::Aot; auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); mod = (*pf)(); } else { - LOG(FATAL) << "Executor not supported"; + LOG(FATAL) << "Executor " << executor_str << " not supported"; } } ~GraphCodegen() {} void Init(runtime::Module* m, TargetsMap targets) { - if (executor_ == Executor::Graph) { + if (executor_ == ExecutorType::Graph) { CallFunc("init", m, targets); - } else if (executor_ == Executor::Aot) { + } else if (executor_ == ExecutorType::Aot) { CallFunc("init", m, targets, target_host_); } else { LOG(FATAL) << "Executor not supported"; @@ -91,16 +93,16 @@ struct GraphCodegen { void Codegen(const Function& func) { CallFunc("codegen", func); } - std::string GetGraph() { - if (executor_ == Executor::Graph) { - return CallFunc("get_graph", nullptr); + std::string GetJSON() { + if (executor_ == ExecutorType::Graph) { + return CallFunc("get_graph_json", nullptr); } else { return ""; } } tir::PrimFunc GetRunnerFunction() { - if (executor_ == Executor::Aot) { + if (executor_ == ExecutorType::Aot) { return CallFunc("get_runner_function"); } else { return tir::PrimFunc(); @@ -138,7 +140,7 @@ struct GraphCodegen { } runtime::AOTMetadata GetAOTMetadata() { - if (executor_ == Executor::Aot) { + if (executor_ == ExecutorType::Aot) { return CallFunc("get_aot_metadata"); } else { // Graph runtime does not need AOT metadata @@ -147,7 +149,7 @@ struct GraphCodegen { } protected: - Executor executor_; + ExecutorType executor_; Target target_host_; tvm::runtime::Module mod; template @@ -176,9 +178,9 @@ class RelayBuildModule : public runtime::ModuleNode { * \return The corresponding member function. */ PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { - if (name == "get_graph") { + if (name == "get_graph_json") { return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetGraph(); }); + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetJSON(); }); } else if (name == "get_runner_function") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetRunnerFunction(); }); @@ -216,7 +218,7 @@ class RelayBuildModule : public runtime::ModuleNode { ICHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1], this->params_); }); - } else if (name == "get_executor") { + } else if (name == "get_executor_type") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { auto target_host = GetTargetHost(); const String executor_str = @@ -235,7 +237,7 @@ class RelayBuildModule : public runtime::ModuleNode { * * \return const std::string graph_json */ - const std::string& GetGraph() { return ret_.graph; } + const std::string& GetJSON() { return ret_.graph_json; } /*! 
* \brief Get the GraphJSON for runtime @@ -547,7 +549,7 @@ class RelayBuildModule : public runtime::ModuleNode { graph_codegen_->Init(nullptr, targets_); graph_codegen_->Codegen(func); - ret_.graph = graph_codegen_->GetGraph(); + ret_.graph_json = graph_codegen_->GetJSON(); ret_.runner_function = graph_codegen_->GetRunnerFunction(); ret_.params = graph_codegen_->GetParams(); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 4fe47c6692e2..16a07ec15a40 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -604,7 +604,7 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { Function func = args[0]; this->output_ = this->codegen_->Codegen(func); }); - } else if (name == "get_graph") { + } else if (name == "get_graph_json") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.graph_json; }); } else if (name == "list_params_name") { diff --git a/src/runtime/crt/aot_executor/aot_executor.c b/src/runtime/crt/aot_executor/aot_executor.c new file mode 100644 index 000000000000..0bd85bb366a8 --- /dev/null +++ b/src/runtime/crt/aot_executor/aot_executor.c @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \brief Main entry point for + * \param model Model descriptor structure to reference for runtime information + * \param inputs Pointer to input pointer(s) + * \param outputs Pointer to output pointer(s) + * \param context Context information to be passed through to operators + * \return tvm_status_t containing success or errors from the model run + */ +#include "tvm/runtime/crt/aot_executor.h" +#include "tvm/runtime/c_runtime_api.h" + +tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs) { + static DLDevice fake_device = {kDLCPU, 0}; + static int64_t fake_dims = 0; + static int64_t fake_shape = {0}; + + DLTensor tensors[model->num_input_tensors + model->num_output_tensors]; // NOLINT + TVMValue tvm_values[model->num_input_tensors + model->num_output_tensors]; // NOLINT + int32_t tvm_typeids[model->num_input_tensors + model->num_output_tensors]; // NOLINT + + for (int i = 0; i < model->num_input_tensors; i++) { + tensors[i] = (DLTensor){ + .device = fake_device, + .data = inputs[i], + .shape = &fake_shape, + .ndim = fake_dims, + .byte_offset = 0, + .strides = NULL, + }; + tvm_values[i].v_handle = &tensors[i]; + } + + for (int i = 0; i < model->num_output_tensors; i++) { + tensors[model->num_input_tensors + i] = (DLTensor){ + .device = fake_device, + .data = outputs[i], + .shape = &fake_shape, + .ndim = fake_dims, + .byte_offset = 0, + .strides = NULL, + }; + tvm_values[model->num_input_tensors + i].v_handle = &tensors[model->num_input_tensors + i]; + } + + return model->run_func(tvm_values, tvm_typeids, 0, NULL, 0, NULL); +} diff --git a/src/runtime/crt/common/crt_backend_api.c b/src/runtime/crt/common/crt_backend_api.c index 9a12bc28240a..56bbbedc1d64 100644 --- a/src/runtime/crt/common/crt_backend_api.c +++ b/src/runtime/crt/common/crt_backend_api.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include "crt_config.h" diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index f73449829bd6..f34bbd4fec95 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -31,8 +31,6 @@ #include #include #include -#include -#include #include // Handle internal errors diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index fb8fc8022f43..c97f7658938f 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -25,7 +25,7 @@ */ #include -#include +#include #include #include "crt_config.h" diff --git a/src/runtime/crt/graph_executor/graph_executor.c b/src/runtime/crt/graph_executor/graph_executor.c index 2fe9e73aeddc..bf64096441be 100644 --- a/src/runtime/crt/graph_executor/graph_executor.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -27,9 +27,9 @@ #include #include #include -#include #include #include +#include #include "crt_config.h" diff --git a/src/runtime/crt/graph_executor/load_json.c b/src/runtime/crt/graph_executor/load_json.c index dd2faecdc538..f1c1f6768168 100644 --- a/src/runtime/crt/graph_executor/load_json.c +++ b/src/runtime/crt/graph_executor/load_json.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include // the node entry structure in serialized format diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index e2e4672cbc9d..c624462ba68c 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git 
a/src/runtime/crt/include/tvm/runtime/crt/internal/memory/memory.h b/src/runtime/crt/include/tvm/runtime/crt/internal/memory/page_allocator.h similarity index 94% rename from src/runtime/crt/include/tvm/runtime/crt/internal/memory/memory.h rename to src/runtime/crt/include/tvm/runtime/crt/internal/memory/page_allocator.h index aae045a0f24d..7d40c03f2673 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/memory/memory.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/memory/page_allocator.h @@ -18,17 +18,17 @@ */ /*! - * \file runtime/crt/include/tvm/runtime/crt/internal/memory/memory.h + * \file runtime/crt/include/tvm/runtime/crt/internal/memory/page_allocator.h * \brief Defines data types and functions used in the internal memory manager. * Exposed for testing. */ -#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_MEMORY_H_ -#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_MEMORY_H_ +#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_PAGE_ALLOCATOR_H_ +#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_PAGE_ALLOCATOR_H_ #include #include -#include +#include #include "crt_config.h" @@ -109,4 +109,4 @@ typedef struct MemoryManager { } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_MEMORY_H_ +#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_MEMORY_PAGE_ALLOCATOR_H_ diff --git a/src/runtime/crt/memory/memory.c b/src/runtime/crt/memory/page_allocator.c similarity index 99% rename from src/runtime/crt/memory/memory.c rename to src/runtime/crt/memory/page_allocator.c index ed18544c2181..18dd818602f9 100644 --- a/src/runtime/crt/memory/memory.c +++ b/src/runtime/crt/memory/page_allocator.c @@ -33,9 +33,8 @@ #include #include #include -#include +#include #include -#include #include // construct a new page diff --git a/src/runtime/crt/memory/stack_memory.c b/src/runtime/crt/memory/stack_allocator.c similarity index 70% rename from src/runtime/crt/memory/stack_memory.c rename to src/runtime/crt/memory/stack_allocator.c index ac805b09d564..5464b92b86c3 100644 --- a/src/runtime/crt/memory/stack_memory.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -19,10 +19,10 @@ // LINT_C_FILE -#include +#include -void* MemoryManager_Allocate(tvm_workspace_t *tvm_runtime_workspace, int32_t nbytes) { - uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT - 1); +void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes) { + uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 1); uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; @@ -35,13 +35,14 @@ void* MemoryManager_Allocate(tvm_workspace_t *tvm_runtime_workspace, int32_t nby return current_alloc; } -tvm_crt_error_t MemoryManager_Free(tvm_workspace_t *tvm_runtime_workspace, void* ptr) { +tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { tvm_runtime_workspace->next_alloc = ptr; return 0; } -void MemoryManager_Init(tvm_workspace_t *tvm_runtime_workspace, uint8_t* g_aot_memory, size_t workspace_size){ - tvm_runtime_workspace->next_alloc = g_aot_memory; - tvm_runtime_workspace->workspace = g_aot_memory; - tvm_runtime_workspace->workspace_size = workspace_size; +void StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* 
g_aot_memory, + size_t workspace_size) { + tvm_runtime_workspace->next_alloc = g_aot_memory; + tvm_runtime_workspace->workspace = g_aot_memory; + tvm_runtime_workspace->workspace_size = workspace_size; } diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 8b7c0eb01840..1736f98dad12 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -35,8 +35,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index e505912d9ffe..344fd3d40ba8 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -112,7 +112,7 @@ TEST(Relay, BuildModule) { auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); auto build_f = build_mod.GetFunction("build", false); - auto json_f = build_mod.GetFunction("get_graph", false); + auto json_f = build_mod.GetFunction("get_graph_json", false); auto mod_f = build_mod.GetFunction("get_module", false); Map targets; Target llvm_tgt = Target("llvm"); diff --git a/tests/cpp/utvm_runtime_standalone_test.cc b/tests/cpp/utvm_runtime_standalone_test.cc index a020aaf55f17..5c642a37d6bc 100644 --- a/tests/cpp/utvm_runtime_standalone_test.cc +++ b/tests/cpp/utvm_runtime_standalone_test.cc @@ -85,7 +85,7 @@ TEST(MicroStandaloneRuntime, BuildModule) { auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); auto build_f = build_mod.GetFunction("build", false); - auto json_f = build_mod.GetFunction("get_graph", false); + auto json_f = build_mod.GetFunction("get_graph_json", false); auto mod_f = build_mod.GetFunction("get_module", false); Map targets; diff --git a/tests/crt/aot_executor_test.cc b/tests/crt/aot_executor_test.cc index 5f5cfdb5d6c7..aa18f2b22b7f 100644 --- a/tests/crt/aot_executor_test.cc +++ b/tests/crt/aot_executor_test.cc @@ -22,8 +22,8 @@ #include #include -int32_t test_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, - void* out_ret_tcode, void* resource_handle) { +int test_run_func(TVMValue* args, int* arg_type_ids, int num_args, TVMValue* out_ret_value, + int* out_ret_tcode, void* resource_handle) { return kTvmErrorNoError; } @@ -34,11 +34,11 @@ TEST(AOTRuntime, NoOp) { .run_func = &test_run_func, }; - ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&test_model, NULL, NULL, NULL)); + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&test_model, NULL, NULL)); } -int32_t error_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, - void* out_ret_tcode, void* resource_handle) { +int32_t error_run_func(TVMValue* args, int* arg_type_ids, int32_t num_args, TVMValue* out_ret_value, + int* out_ret_tcode, void* resource_handle) { return kTvmErrorPlatformNoMemory; } @@ -49,11 +49,11 @@ TEST(AOTRuntime, Error) { .run_func = &error_run_func, }; - ASSERT_EQ(kTvmErrorPlatformNoMemory, tvm_runtime_run(&error_model, NULL, NULL, NULL)); + ASSERT_EQ(kTvmErrorPlatformNoMemory, tvm_runtime_run(&error_model, NULL, NULL)); } -int32_t identity_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, - void* out_ret_tcode, void* resource_handle) { +int32_t identity_run_func(TVMValue* args, int* arg_type_ids, int32_t num_args, + TVMValue* out_ret_value, int* out_ret_tcode, void* resource_handle) { void* arg0 = 
(((TVMValue*)args)[0].v_handle); void* arg1 = (((TVMValue*)args)[1].v_handle); void* placeholder = (((DLTensor*)arg0)[0].data); @@ -74,12 +74,12 @@ TEST(AOTRuntime, Identity) { uint32_t outputs1[1]; void* outputs[] = {outputs1}; - ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&identity_model, inputs, outputs, NULL)); + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&identity_model, inputs, outputs)); ASSERT_EQ(outputs1[0], 404); } -int32_t add_run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, - void* out_ret_tcode, void* resource_handle) { +int32_t add_run_func(TVMValue* args, int* arg_type_ids, int32_t num_args, TVMValue* out_ret_value, + int* out_ret_tcode, void* resource_handle) { void* arg0 = (((TVMValue*)args)[0].v_handle); void* arg1 = (((TVMValue*)args)[1].v_handle); void* placeholder = (((DLTensor*)arg0)[0].data); @@ -102,12 +102,13 @@ TEST(AOTRuntime, Add) { uint32_t outputs1[1]; void* outputs[] = {outputs1}; - ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&add_model, inputs, outputs, NULL)); + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&add_model, inputs, outputs)); ASSERT_EQ(outputs1[0], 904); } -int32_t multiple_inputs_run_func(void* args, void* arg_type_ids, int32_t num_args, - void* out_ret_value, void* out_ret_tcode, void* resource_handle) { +int32_t multiple_inputs_run_func(TVMValue* args, int* arg_type_ids, int32_t num_args, + TVMValue* out_ret_value, int* out_ret_tcode, + void* resource_handle) { void* arg0 = (((TVMValue*)args)[0].v_handle); void* arg1 = (((TVMValue*)args)[1].v_handle); void* arg2 = (((TVMValue*)args)[2].v_handle); @@ -133,12 +134,13 @@ TEST(AOTRuntime, MultipleInputs) { uint32_t outputs1[1]; void* outputs[] = {outputs1}; - ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_inputs_model, inputs, outputs, NULL)); + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_inputs_model, inputs, outputs)); ASSERT_EQ(outputs1[0], 1306); } -int32_t multiple_outputs_run_func(void* args, void* arg_type_ids, int32_t num_args, - void* out_ret_value, void* out_ret_tcode, void* resource_handle) { +int32_t multiple_outputs_run_func(TVMValue* args, int* arg_type_ids, int32_t num_args, + TVMValue* out_ret_value, int* out_ret_tcode, + void* resource_handle) { void* arg0 = (((TVMValue*)args)[0].v_handle); void* arg1 = (((TVMValue*)args)[1].v_handle); void* arg2 = (((TVMValue*)args)[2].v_handle); @@ -164,34 +166,11 @@ TEST(AOTRuntime, MultipleOutputs) { uint32_t outputs2[1]; void* outputs[] = {outputs1, outputs2}; - ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_outputs_model, inputs, outputs, NULL)); + ASSERT_EQ(kTvmErrorNoError, tvm_runtime_run(&multiple_outputs_model, inputs, outputs)); ASSERT_EQ(outputs1[0], 404); ASSERT_EQ(outputs2[0], 500); } -int32_t resource_handle_check_run_func(void* args, void* arg_type_ids, int32_t num_args, - void* out_ret_value, void* out_ret_tcode, - void* resource_handle) { - if (resource_handle == NULL) { - return kTvmErrorFunctionCallWrongArgType; - } - return kTvmErrorNoError; -} - -TEST(AOTRuntime, ContextPassing) { - tvm_context_t stub_context = {}; - const tvm_model_t resource_handle_check_model = { - .num_input_tensors = 0, - .num_output_tensors = 0, - .run_func = &resource_handle_check_run_func, - }; - - ASSERT_EQ(kTvmErrorNoError, - tvm_runtime_run(&resource_handle_check_model, NULL, NULL, &stub_context)); - ASSERT_EQ(kTvmErrorFunctionCallWrongArgType, - tvm_runtime_run(&resource_handle_check_model, NULL, NULL, NULL)); -} - int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); 
testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index 15780e0b20f5..259550d0a813 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -18,7 +18,7 @@ */ #include -#include +#include /* * Tests allocations are properly aligned when allocated @@ -27,18 +27,18 @@ TEST(AOTMemory, Allocate) { static uint8_t model_memory[80]; tvm_workspace_t tvm_runtime_workspace; - MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 2); + void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); ASSERT_EQ(block_two, &model_memory[16]); - void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 24); + void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 24); ASSERT_EQ(two_blocks, &model_memory[32]); - void* block_three = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_three, &model_memory[64]); } @@ -48,20 +48,20 @@ TEST(AOTMemory, Allocate) { TEST(AOTMemory, Free) { static uint8_t model_memory[80]; tvm_workspace_t tvm_runtime_workspace; - MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_two, &model_memory[16]); - ASSERT_EQ(0, MemoryManager_Free(&tvm_runtime_workspace, block_two)); + ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); - void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 2); + void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); ASSERT_EQ(two_blocks, &model_memory[16]); - ASSERT_EQ(0, MemoryManager_Free(&tvm_runtime_workspace, two_blocks)); + ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); - void* block_three = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_three, &model_memory[16]); } @@ -71,15 +71,15 @@ TEST(AOTMemory, Free) { TEST(AOTMemory, OverAllocate) { static uint8_t model_memory[72]; tvm_workspace_t tvm_runtime_workspace; - MemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = MemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_two, &model_memory[16]); - void* two_blocks = MemoryManager_Allocate(&tvm_runtime_workspace, 64); + void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 64); ASSERT_EQ(two_blocks, (void*)NULL); } diff --git 
a/tests/crt/framing_test.cc b/tests/crt/framing_test.cc index 241e23d877cb..5ee226dc5ee7 100644 --- a/tests/crt/framing_test.cc +++ b/tests/crt/framing_test.cc @@ -18,7 +18,7 @@ */ #include -#include +#include #include #include diff --git a/tests/crt/memory_test.cc b/tests/crt/memory_test.cc index d876e5c96da9..b11ab774f101 100644 --- a/tests/crt/memory_test.cc +++ b/tests/crt/memory_test.cc @@ -18,8 +18,8 @@ */ #include -#include -#include +#include +#include #include "crt_config.h" #include "platform.cc" diff --git a/tests/crt/session_test.cc b/tests/crt/session_test.cc index 60686be25060..9840f55dc685 100644 --- a/tests/crt/session_test.cc +++ b/tests/crt/session_test.cc @@ -18,7 +18,7 @@ */ #include -#include +#include #include #include diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index 91030fac787a..d65fd1221c20 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -47,7 +47,7 @@ CRT_SRCS = $(shell find $(CRT_ROOT)) aot_test_runner: $(build_dir)/aot_test_runner -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o $(build_dir)/stack_memory.o $(build_dir)/crt_backend_api.o +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm @@ -63,7 +63,7 @@ $(build_dir)/tvm_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_execut $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) -$(build_dir)/stack_memory.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_memory.c +$(build_dir)/stack_allocator.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_allocator.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) diff --git a/tests/python/relay/aot/infra.py b/tests/python/relay/aot/infra.py index 0ccd474ed046..15c2775abcfc 100644 --- a/tests/python/relay/aot/infra.py +++ b/tests/python/relay/aot/infra.py @@ -73,7 +73,7 @@ def create_main(test_name, input_list, output_list, output_path): with open(raw_path, "w") as main_file: main_file.write("#include \n") main_file.write('#include "aot_executor.h"\n') - main_file.write('#include "stack_memory.h"\n') + main_file.write('#include "stack_allocator.h"\n') main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") @@ -87,12 +87,12 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("tvm_workspace_t app_workspace;\n") main_file.write( """ -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr){ - (*out_ptr) = MemoryManager_Allocate(&app_workspace, num_bytes); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + (*out_ptr) = StackMemoryManager_Allocate(&app_workspace, num_bytes); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev){ - MemoryManager_Free(&app_workspace,ptr); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + StackMemoryManager_Free(&app_workspace,ptr); } void TVMPlatformAbort(tvm_crt_error_t code) { } @@ -113,8 +113,8 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("output_data%i, " % i) main_file.write("};\n") - main_file.write("MemoryManager_Init(&app_workspace, 
g_aot_memory, WORKSPACE_SIZE);") - main_file.write("tvm_runtime_run(&network, inputs, outputs, NULL);") + main_file.write("StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);") + main_file.write("tvm_runtime_run(&network, inputs, outputs);") for i in range(0, len(output_list)): main_file.write("for (int i = 0; i Date: Tue, 13 Apr 2021 18:02:58 +0100 Subject: [PATCH 05/33] fix linting Change-Id: Iba6544ac7101595696b352b8702345cf916625f6 --- apps/bundle_deploy/bundle_static.c | 2 +- src/runtime/crt/aot_executor/aot_executor.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index 6e3867e4f4a2..d0eeec4d956f 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -22,8 +22,8 @@ #include #include #include -#include #include +#include #include #ifdef ENABLE_TVM_PLATFORM_ABORT_BACKTRACE diff --git a/src/runtime/crt/aot_executor/aot_executor.c b/src/runtime/crt/aot_executor/aot_executor.c index 0bd85bb366a8..4f30c152f4af 100644 --- a/src/runtime/crt/aot_executor/aot_executor.c +++ b/src/runtime/crt/aot_executor/aot_executor.c @@ -26,6 +26,7 @@ * \return tvm_status_t containing success or errors from the model run */ #include "tvm/runtime/crt/aot_executor.h" + #include "tvm/runtime/c_runtime_api.h" tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs) { From 7d2cdf47e018a775b59bdc6442d12b41ef5c5ed4 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 14 Apr 2021 13:18:50 +0100 Subject: [PATCH 06/33] fix linting - 2 Change-Id: I7f80d16005f2c621d37a9aae2cbbd61df0277cbe --- python/tvm/relay/backend/executor_factory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index 2b1e1e597387..9206d6138c21 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -15,15 +15,14 @@ # specific language governing permissions and limitations # under the License. """Executor factory modules.""" -import warnings from abc import abstractmethod +from tvm import tir + from ..._ffi.base import string_types from ..._ffi.registry import get_global_func from ...runtime import ndarray -from tvm import tir - class ExecutorFactoryModule: """Common interface for executor factory modules From 61ea6d3b6a33ceb184c1b5b9842b5423dbded1ab Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 14 Apr 2021 20:09:36 +0100 Subject: [PATCH 07/33] fix linting - 3 Change-Id: I7a1ba40afeea46d5f122563a20cd4b2f08751a1e --- include/tvm/runtime/crt/aot_executor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/runtime/crt/aot_executor.h b/include/tvm/runtime/crt/aot_executor.h index 6d3e80e2ccaf..35b440d44ca3 100644 --- a/include/tvm/runtime/crt/aot_executor.h +++ b/include/tvm/runtime/crt/aot_executor.h @@ -18,7 +18,7 @@ */ /*! 
- * \file include/tvm/runtime/crt/aot/tvm_executor.h + * \file include/tvm/runtime/crt/aot_executor.h * \brief TVM Executor for the Ahead-of-Time Runtime * * AOT models are described by the TVM model descriptor format From 559fcd2c228ef1dd71811b7bc43e95e10f1b46a0 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 16 Apr 2021 14:46:15 +0100 Subject: [PATCH 08/33] fix tests Change-Id: I1297ccc54dd6d93647f421e0beb226f410bf73f5 --- python/tvm/relay/backend/executor_factory.py | 49 +++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index 9206d6138c21..ce9711ff746f 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -16,6 +16,7 @@ # under the License. """Executor factory modules.""" from abc import abstractmethod +import warnings from tvm import tir @@ -51,6 +52,35 @@ def get_lib(self): """ Return the generated library""" raise NotImplementedError + @abstractmethod + def get_internal_repr(self): + """ Return the internal representation used to execute the network""" + raise NotImplementedError + + def __getitem__(self, item): + print(item) + return self.module.__getitem__(item) + + def __iter__(self): + warnings.warn( + "legacy graph executor behavior of producing json / lib / params will be " + "removed in the next release." + " Please see documents of tvm.contrib.graph_executor.GraphModule for the " + " new recommended usage.", + DeprecationWarning, + 2, + ) + return self + + def __next__(self): + if self.iter_cnt > 2: + raise StopIteration + + objs = [self.get_internal_repr(), self.lib, self.params] + obj = objs[self.iter_cnt] + self.iter_cnt += 1 + return obj + class AOTExecutorFactoryModule(ExecutorFactoryModule): """AOT executor factory module. @@ -102,7 +132,7 @@ class GraphExecutorFactoryModule(ExecutorFactoryModule): Parameters ---------- - graph_str : the json graph to be deployed in json format output by graph compiler. + graph_json_str : the json graph to be deployed in json format output by graph compiler. The graph can contain operator(tvm_op) that points to the name of PackedFunc in the libmod. 
target : tvm.Target @@ -115,8 +145,8 @@ class GraphExecutorFactoryModule(ExecutorFactoryModule): The parameters of module """ - def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): - assert isinstance(graph_str, string_types) + def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params): + assert isinstance(graph_json_str, string_types) fcreate = get_global_func("tvm.graph_executor_factory.create") args = [] for k, v in params.items(): @@ -125,8 +155,8 @@ def __init__(self, ir_mod, target, graph_str, libmod, libmod_name, params): self.ir_mod = ir_mod self.target = target - self.module = fcreate(graph_str, libmod, libmod_name, *args) - self.graph = graph_str + self.module = fcreate(graph_json_str, libmod, libmod_name, *args) + self.graph_json = graph_json_str self.lib = libmod self.libmod_name = libmod_name self.params = params @@ -136,19 +166,16 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return self.module.export_library(file_name, fcompile, addons, **kwargs) def save_executor_config(self): - return self.graph + return self.internal_repr def get_params(self): return self.params def get_graph_json(self): - return self.internal_repr + return self.graph_json def get_internal_repr(self): - return self.graph + return self.graph_json def get_lib(self): return self.lib - - def __getitem__(self, item): - return self.module.__getitem__(item) From 8cd223e6cfeb4b33e3f34a0ae44b6d8dabb39db4 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 19 Apr 2021 14:31:24 +0100 Subject: [PATCH 09/33] Addressing comments - 3 Change-Id: Id25d1382c30d6d0a0013b5e8986fb8cd886666dc --- include/tvm/runtime/crt/page_allocator.h | 4 +- include/tvm/tir/builtin.h | 27 ++++ python/tvm/relay/backend/executor_factory.py | 26 +-- python/tvm/relay/build_module.py | 30 +++- ...aot_codegen.cc => aot_executor_codegen.cc} | 136 ++++++---------- src/relay/backend/build_module.cc | 150 +++++++----------- src/relay/backend/graph_executor_codegen.cc | 11 +- src/relay/backend/utils.h | 12 ++ src/runtime/crt/aot_executor/aot_executor.c | 5 +- src/runtime/crt/common/aot_backend_api.c | 59 ------- src/runtime/crt/host/main.cc | 3 +- .../crt/internal/aot_executor}/aot_executor.h | 7 +- src/runtime/crt/memory/page_allocator.c | 23 +-- src/runtime/meta_data.h | 20 +-- src/target/metadata_module.cc | 4 +- src/target/source/codegen_c.cc | 7 +- src/target/source/codegen_c_host.cc | 128 ++++++++------- src/target/source/codegen_c_host.h | 16 +- src/target/source/codegen_source_base.h | 6 +- src/target/source/source_module.cc | 20 +-- src/target/source/source_module.h | 3 +- src/tir/op/builtin.cc | 5 +- src/tir/transforms/lower_tvm_builtin.cc | 13 +- tests/crt/aot_executor_test.cc | 2 +- tests/crt/memory_test.cc | 2 +- tests/python/relay/aot/aot_test.mk | 9 +- .../relay/aot/{infra.py => aot_test_utils.py} | 22 +-- tests/python/relay/aot/test_crt_aot.py | 26 +-- .../relay/test_backend_graph_executor.py | 4 + tests/python/unittest/test_crt.py | 7 +- tests/python/unittest/test_link_params.py | 4 +- .../test_micro_model_library_format.py | 2 +- .../test_runtime_module_based_interface.py | 10 +- 33 files changed, 364 insertions(+), 439 deletions(-) rename src/relay/backend/{aot_codegen.cc => aot_executor_codegen.cc} (84%) delete mode 100644 src/runtime/crt/common/aot_backend_api.c rename {include/tvm/runtime/crt => src/runtime/crt/include/tvm/runtime/crt/internal/aot_executor}/aot_executor.h (90%) rename tests/python/relay/aot/{infra.py => aot_test_utils.py} (92%) diff 
--git a/include/tvm/runtime/crt/page_allocator.h b/include/tvm/runtime/crt/page_allocator.h index a379c6b8ded5..7a5de169c72e 100644 --- a/include/tvm/runtime/crt/page_allocator.h +++ b/include/tvm/runtime/crt/page_allocator.h @@ -72,8 +72,8 @@ struct MemoryManagerInterface { * \param page_size_bytes_log2 log2 of the page size, in bytes. * \return kTvmErrorNoError on success. */ -tvm_crt_error_t MemoryManagerCreate(MemoryManagerInterface** manager, uint8_t* memory_pool, - size_t memory_pool_size_bytes, size_t page_size_bytes_log2); +tvm_crt_error_t PageMemoryManagerCreate(MemoryManagerInterface** manager, uint8_t* memory_pool, + size_t memory_pool_size_bytes, size_t page_size_bytes_log2); #ifdef __cplusplus } // extern "C" diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index d8248d4e1a87..aab5d662d49c 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -346,6 +346,18 @@ TVM_DLL const Op& tvm_stack_make_array(); */ TVM_DLL const Op& tvm_call_packed(); +/*! + * \brief See pesudo code + * + * return_type tvm_call_packed(fname, TVMValue* args) { + * int ret_code; + * TVMValue ret_value; + * (*fname)(args, type_code_of(args), len(args), &ret_value, &ret_code); + * return cast(return_type, ret_value.v_return_type); + * } + */ +TVM_DLL const Op& tvm_call_cpacked(); + /*! * \brief See pesudo code * @@ -392,6 +404,21 @@ TVM_DLL const Op& tvm_thread_context(); */ TVM_DLL const Op& tvm_call_packed_lowered(); +/*! + * \brief Lowered version of call c-packed, the space of value and + * type codes are explicitly allocated. + * + * int tvm_call_packed_lowered(fname, + * TVMValue* value_stack, + * int* tcode_stack, + * int begin, + * int end) { + * fname(TVMArgs(value_stack[begin:end], tcode_stack[begin:end]), + * TVMRetValue(value_stack + end, tcode_stack + end)); + * } + */ +TVM_DLL const Op& tvm_call_cpacked_lowered(); + /*! * \brief Lowered version of trace intrinsic, the space of value and * type codes are explicitly allocated. The return value is the diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index ce9711ff746f..d81305145ae1 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -18,8 +18,6 @@ from abc import abstractmethod import warnings -from tvm import tir - from ..._ffi.base import string_types from ..._ffi.registry import get_global_func from ...runtime import ndarray @@ -52,11 +50,6 @@ def get_lib(self): """ Return the generated library""" raise NotImplementedError - @abstractmethod - def get_internal_repr(self): - """ Return the internal representation used to execute the network""" - raise NotImplementedError - def __getitem__(self, item): print(item) return self.module.__getitem__(item) @@ -85,9 +78,8 @@ def __next__(self): class AOTExecutorFactoryModule(ExecutorFactoryModule): """AOT executor factory module. - Parameters + Attributes ---------- - runner_function : the PrimFunc containing of the TIR main executor function. target : tvm.Target The Target used to build this module. 
libmod : tvm.Module @@ -98,29 +90,19 @@ class AOTExecutorFactoryModule(ExecutorFactoryModule): The parameters of module """ - def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, params): - assert isinstance(runner_function, tir.PrimFunc) - args = [] - for k, v in params.items(): - args.append(k) - args.append(ndarray.array(v)) - + def __init__(self, ir_mod, target, libmod, libmod_name, params): self.ir_mod = ir_mod self.target = target - self.runner_func = runner_function self.lib = libmod self.libmod_name = libmod_name self.params = params self.iter_cnt = 0 - # Sometimes we want to get params explicitly. - # For example, we want to save its params value to - # an independent file. def get_params(self): return self.params def get_internal_repr(self): - return self.runner_func + return None def get_lib(self): return self.lib @@ -130,7 +112,7 @@ class GraphExecutorFactoryModule(ExecutorFactoryModule): """Graph executor factory module. This is a module of graph executor factory - Parameters + Attributes ---------- graph_json_str : the json graph to be deployed in json format output by graph compiler. The graph can contain operator(tvm_op) that points to the name of diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 826f50858770..81e81852d438 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -78,7 +78,6 @@ class BuildModule(object): def __init__(self): self.mod = _build_module._BuildModule() self._get_graph_json = self.mod["get_graph_json"] - self._get_runner_function = self.mod["get_runner_function"] self._get_module = self.mod["get_module"] self._build = self.mod["build"] self._optimize = self.mod["optimize"] @@ -113,6 +112,7 @@ def build(self, mod, target=None, target_host=None, params=None): Returns ------- +<<<<<<< HEAD graph_json : str The json string that can be accepted by graph executor. @@ -121,6 +121,10 @@ def build(self, mod, target=None, target_host=None, params=None): params : dict The parameters of the final graph. +======= + factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule + The runtime factory for the TVM executor. +>>>>>>> f65012308... Addressing comments - 3 """ target = _update_target(target) target, target_host = Target.check_and_update_host_consist( @@ -147,11 +151,7 @@ def build(self, mod, target=None, target_host=None, params=None): # Get artifacts mod = self.get_module() params = self.get_params() - internal_repr = ( - self._get_runner_function() - if self.get_executor_type() == "aot" - else self.get_graph_json() - ) + internal_repr = self.get_graph_json() if self.get_executor_type() == "graph" else None return internal_repr, mod, params @@ -261,8 +261,22 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- +<<<<<<< HEAD factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. +======= + internal_repr : str or tir.PrimFunc + The internal representation the executor uses to execute the + network. Can be a string representing the json graph (if we are + building for graph executor) or the PrimFunc representing the + AOT runner function + + mod : tvm.Module + The module containing necessary libraries. + + params : dict + The parameters of the final graph. +>>>>>>> f65012308... 
Addressing comments - 3 """ # pylint: enable=line-too-long # fmt: on @@ -301,14 +315,14 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" if bld_mod.get_executor_type() == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( - ir_mod, target, internal_repr, runtime_mod, mod_name, params + ir_mod, target, runtime_mod, mod_name, params ) elif bld_mod.get_executor_type() == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, target, internal_repr, runtime_mod, mod_name, params ) else: - assert False, "Executor not supported" + assert False, "Executor " + bld_mod.get_executor_type() + " not supported" return executor_factory diff --git a/src/relay/backend/aot_codegen.cc b/src/relay/backend/aot_executor_codegen.cc similarity index 84% rename from src/relay/backend/aot_codegen.cc rename to src/relay/backend/aot_executor_codegen.cc index 5a509a8219c1..c514ecb24fef 100644 --- a/src/relay/backend/aot_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -35,7 +35,6 @@ #include #include -#include "../../runtime/meta_data.h" #include "compile_engine.h" #include "utils.h" @@ -48,15 +47,6 @@ using ShapeVector = std::vector>; using GraphAttrs = std::unordered_map; using TargetsMap = std::unordered_map; -/*! \brief Lowered outputs */ -struct AOTLoweredOutput { - tir::PrimFunc runner_func; - Map lowered_funcs; - Array external_mods; - std::unordered_map> params; - runtime::AOTMetadata aot_metadata; -}; - class AotReturnSidVisitor : public ExprVisitor { public: explicit AotReturnSidVisitor(Map> storage_device_map) @@ -116,18 +106,6 @@ class AOTCodegen : public ExprVisitor { return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), args); } - /*! - * \brief Utility function to allocate memory for storage identifiers - * \param memory_size_byte size in bytes of the allocation - * \return PrimExpr representing the allocated memory - */ - PrimExpr AllocateBackendMemory(int memory_size_byte) { - // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace - // to enable unified memory planning - static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace"); - return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 8}); - } - /*! * \brief Utility function to convert a concrete integer to a PrimExpr. * \param num the number to convert @@ -141,7 +119,7 @@ class AOTCodegen : public ExprVisitor { /*! * \brief Return a vector of variables that represents the sids for the given Relay Expr */ - std::vector pack_sid(Expr expr) { + std::vector PackSid(Expr expr) { Array sids = storage_device_map_[expr]; std::vector sid_vars; @@ -157,7 +135,7 @@ class AOTCodegen : public ExprVisitor { continue; } // Pack the sid inside the TVMValue - auto sid_array = te::Var(make_string("sid_", sid, "_value"), DataType::Handle()); + auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); auto sid_value = sids_table_[sid]; tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), @@ -173,17 +151,17 @@ class AOTCodegen : public ExprVisitor { * \param expr Relay Expression assicated with the parameter * \return Variable that represents the DLTensor associated with the parameters */ - tir::Var pack_param(Expr expr) { + tir::Var PackParam(Expr expr) { // TODO(giuseros): Using call_extern to call into lookup_linked_param. This is because the // builtin::ret is not supported yet in the c target. 
Once return is supported we can use // tvm_call_packed_lowered(). int param_sid = param_storage_ids_[params_by_expr_[expr]]; auto lookup_linked_param_fn = tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param); - auto param_array = te::Var(make_string("param_", param_sid, "_array"), DataType::Handle()); + auto param_array = te::Var(MakeString("param_", param_sid, "_array"), DataType::Handle()); // Compose the lookup_call using a local stack Array lookup_call; - auto param_var = te::Var(make_string("param_", param_sid, "_value"), DataType::Handle()); + auto param_var = te::Var(MakeString("param_", param_sid, "_value"), DataType::Handle()); auto ret_var = te::Var("ret_value", DataType::Handle()); auto ret_code = te::Var("ret_value", DataType::Handle()); @@ -216,7 +194,7 @@ class AOTCodegen : public ExprVisitor { /*! * brief Given an expression return the variable(s) associated with that expression */ - std::vector find_expr(Expr arg) { + std::vector FindExpr(Expr arg) { auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg); if (input_iter != input_vars_.end()) { // Input variable @@ -224,45 +202,47 @@ class AOTCodegen : public ExprVisitor { return {main_signature_[main_index]}; } else if (params_by_expr_.find(arg) != params_by_expr_.end()) { // Parameter of the network - return {pack_param(arg)}; + return {PackParam(arg)}; } else { // Storage identifier (i.e., intermediate memory) - return pack_sid(arg); + return PackSid(arg); } } /*! * brief Call a function with a given name */ - void func_call(Call call, std::string func_name) { + void CreateFuncCall(Call call, std::string func_name) { tvm::Array args{tvm::tir::StringImm(func_name)}; - std::vector func_call_stmts; + std::vector CreateFuncCall_stmts; // Pack the inputs for (Expr arg : call->args) { - auto var_arg = find_expr(arg); + auto var_arg = FindExpr(arg); args.push_back(var_arg[0]); } auto ret_expr = Downcast(call); // Pack the return(s) value. A call node can produce multiple outputs - for (const auto& var : pack_sid(ret_expr)) { + for (const auto& var : PackSid(ret_expr)) { args.push_back(var); } // Use tvm_call_packed to execute the function - func_call_stmts.push_back(tir::Evaluate( - tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_packed(), args))); - tir::Stmt body = tir::SeqStmt(func_call_stmts); + CreateFuncCall_stmts.push_back(tir::Evaluate( + tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args))); + tir::Stmt body = tir::SeqStmt(CreateFuncCall_stmts); stmts_.push_back(body); } /*! * brief Copy a variable to the output. This function is mainly used in edge cases * when we want to return an input or a parameter. + * TODO(giuseros): we should try to avoid unnecessary copy to the output, e.g., in a + * copy-on-write fashion. */ - void copy_to_output(te::Var out, te::Var in, size_t size) { + void CopyToOutput(te::Var out, te::Var in, size_t size) { auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), {in, 0, tir::builtin::kArrData}); @@ -285,7 +265,7 @@ class AOTCodegen : public ExprVisitor { * Utility function to string together different arguments */ template - std::string make_string(Args const&... args) { + std::string MakeString(Args const&... 
args) { std::ostringstream ss; using List = int[]; (void)List{0, ((void)(ss << args), 0)...}; @@ -328,7 +308,7 @@ class AOTCodegen : public ExprVisitor { UpdateConstants(func, ¶ms_); // Generate the TIR function call - func_call(GetRef(op), ext_func->func_name); + CreateFuncCall(GetRef(op), ext_func->func_name); } ICHECK_GE(storage_device_map_.count(expr), 0); @@ -360,7 +340,7 @@ class AOTCodegen : public ExprVisitor { lowered_funcs_[target->str()]->Update(lowered_func->funcs); // Generate the TIR function call - func_call(GetRef(op), lowered_func->func_name); + CreateFuncCall(GetRef(op), lowered_func->func_name); } void VisitExpr_(const VarNode* op) override { @@ -374,8 +354,8 @@ class AOTCodegen : public ExprVisitor { static_cast((sids[0][0].as())->value)); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - auto var_expr = find_expr(expr); - copy_to_output(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]); + auto var_expr = FindExpr(expr); + CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]); } } @@ -395,8 +375,7 @@ class AOTCodegen : public ExprVisitor { static_cast((sids[0][0].as())->value)); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - copy_to_output(main_signature_[input_vars_.size() + output_index], pack_param(expr), - sids[2][0]); + CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]); } } @@ -418,7 +397,7 @@ class AOTCodegen : public ExprVisitor { void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if not supported"); } void VisitExpr_(const FunctionNode* op) override { ICHECK(op->GetAttr(attr::kCompiler).defined()) - << "Only functions supported by custom codegen"; + << "FunctionNode only supported by custom codegen"; } void VisitExpr_(const RefCreateNode* op) override { throw std::invalid_argument("reference not supported"); @@ -460,10 +439,10 @@ class AOTCodegen : public ExprVisitor { continue; } - // TODO(giuseros): we should allocate this one time outside the PrimFunc - // so we dont' pay the price of allocation for every inference + // TODO(giuseros): we should allocate this once outside the PrimFunc + // so we don't pay the price of allocation for every inference if (!allocated[sid]) { - body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), body); + body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body); } allocated[sid] = true; } @@ -473,9 +452,13 @@ class AOTCodegen : public ExprVisitor { body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body); body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body); + // Define the PrimFunc attributes + Map dict_attrs; + dict_attrs.Set("global_symbol", runtime::String(runtime::symbol::tvm_run_func_prefix)); + // Make the PrimFunc return tir::PrimFunc(main_signature_, body, VoidType(), Map(), - DictAttrs(dict_attrs_)); + DictAttrs(dict_attrs)); } protected: @@ -489,8 +472,6 @@ class AOTCodegen : public ExprVisitor { TargetsMap targets_; /*! \brief target host */ Target target_host_; - /*! PrimFunc attributes */ - Map dict_attrs_; /*! * \brief parameters (i.e. ConstantNodes found in the graph). @@ -509,14 +490,8 @@ class AOTCodegen : public ExprVisitor { std::unordered_map sids_table_; /*! \brief lowered funcs */ std::unordered_map lowered_funcs_; - /*! \brief name map */ - std::unordered_map name_map_; /*! 
\brief compile engine */ CompileEngine compile_engine_; - /*! \brief GraphPlanMemory module */ - runtime::Module graph_plan_memory_module_; - /*! \brief the IR module stored which represents the executor program */ - Map tir_module_; /*! \brief the set of statements that make the program */ std::vector stmts_; /*! \brief the list of return sids (note that the function might return more then one output */ @@ -528,10 +503,9 @@ class AOTCodegen : public ExprVisitor { compile_engine_ = CompileEngine::Global(); targets_ = targets; target_host_ = target_host; - dict_attrs_.Set("global_symbol", runtime::String("tvm__run_func")); } - AOTLoweredOutput Codegen(relay::Function func) { + LoweredOutput Codegen(relay::Function func) { // Get the module, storage map and token sizes auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); storage_device_map_ = (*pf)(func); @@ -539,13 +513,13 @@ class AOTCodegen : public ExprVisitor { int input_index = 0; for (auto input : func->params) { input_vars_.push_back(input); - main_signature_.push_back(tir::Var(make_string("input_", input_index), DataType::Handle())); + main_signature_.push_back(tir::Var(MakeString("input_", input_index), DataType::Handle())); } // Define the storage allocator ids for (auto kv : storage_device_map_) { for (const auto& sid : kv.second[0]) { - te::Var sid_var(make_string("sid_", sid), DataType::Handle()); + te::Var sid_var(MakeString("sid_", sid), PointerType(PrimType(DataType::Int(8)))); sids_table_[sid] = sid_var; } } @@ -553,13 +527,13 @@ class AOTCodegen : public ExprVisitor { // Find the return sid return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { - main_signature_.push_back(tir::Var(make_string("output_", output_index), DataType::Handle())); + main_signature_.push_back(tir::Var(MakeString("output_", output_index), DataType::Handle())); } VisitExpr(func->body); auto prim_func = CreateMainFunc(func->params.size()); - AOTLoweredOutput ret; + LoweredOutput ret; ret.params = std::unordered_map>(); for (auto param : params_) { @@ -588,8 +562,7 @@ class AOTCodegen : public ExprVisitor { ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); } - ret.runner_func = prim_func; - ret.aot_metadata = runtime::AOTMetadata(input_vars_.size(), return_sid_.size()); + ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size()); return ret; } }; @@ -600,22 +573,17 @@ class AOTCodegenModule : public runtime::ModuleNode { virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - ICHECK_EQ(args.num_args, 3) << "The expected of arguments are: " - << "runtime::Module mod and Map targets"; + ICHECK_EQ(args.num_args, 2) << "The expected of arguments are: " + << "runtime::Module mod and Map targets"; void* mod = args[0]; - Map tmp = args[1]; - tvm::Target target_host = args[2]; - init(mod, tmp, target_host); + Map targets = args[1]; + init(mod, targets); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { Function func = args[0]; this->output_ = codegen(func); }); - } else if (name == "get_runner_function") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = get_runner_function(); - }); // c; }); } else if (name == "list_params_name") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { 
*rv = list_params_name(); }); @@ -635,9 +603,9 @@ class AOTCodegenModule : public runtime::ModuleNode { } else if (name == "get_external_modules") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_external_modules(); }); - } else if (name == "get_aot_metadata") { + } else if (name == "get_metadata") { return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_aot_metadata(); }); + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = output_.metadata; }); } else { return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); } @@ -646,10 +614,14 @@ class AOTCodegenModule : public runtime::ModuleNode { const char* type_key() const final { return "RelayGraphRuntimeCodegenModule"; } private: - void init(void* mod, Map tmp, Target target_host) { + void init(void* mod, Map tmp) { TargetsMap targets; + Target target_host; for (const auto& it : tmp) { auto dev_type = it.first.as(); + if (!target_host.defined() && it.second->kind->device_type == kDLCPU) { + target_host = it.second; + } ICHECK(dev_type); targets[dev_type->value] = it.second; } @@ -657,9 +629,7 @@ class AOTCodegenModule : public runtime::ModuleNode { std::make_shared(reinterpret_cast(mod), targets, target_host); } - AOTLoweredOutput codegen(Function func) { return this->codegen_->Codegen(func); } - - tir::PrimFunc get_runner_function() { return this->output_.runner_func; } + LoweredOutput codegen(Function func) { return this->codegen_->Codegen(func); } Array list_params_name() { Array ret; @@ -685,10 +655,8 @@ class AOTCodegenModule : public runtime::ModuleNode { Map get_irmodule() { return this->output_.lowered_funcs; } - runtime::AOTMetadata get_aot_metadata() { return output_.aot_metadata; } - std::shared_ptr codegen_; - AOTLoweredOutput output_; + LoweredOutput output_; }; runtime::Module CreateAOTCodegenMod() { diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 2f192c0b0c17..57bd48102ae1 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -43,80 +43,20 @@ namespace backend { using TargetsMap = Map; using namespace tvm::relay::transform; -/*! - * Type of supported executors - */ -enum class ExecutorType { Graph, Aot }; - /*! * \brief Output of building module */ struct BuildOutput { std::string graph_json; - tir::PrimFunc runner_function; runtime::Module mod; std::unordered_map params; }; -/*! 
- * \brief GraphCodegen module wrapper - * - */ -struct GraphCodegen { - public: - explicit GraphCodegen(Target target_host) : target_host_(target_host) { - const String executor_str = - target_host->GetAttr("executor").value_or(kTvmExecutorGraph); - if (executor_str == kTvmExecutorGraph) { - executor_ = ExecutorType::Graph; - auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); - mod = (*pf)(); - } else if (executor_str == kTvmExecutorAot) { - executor_ = ExecutorType::Aot; - auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); - mod = (*pf)(); - } else { - LOG(FATAL) << "Executor " << executor_str << " not supported"; - } - } - ~GraphCodegen() {} - - void Init(runtime::Module* m, TargetsMap targets) { - if (executor_ == ExecutorType::Graph) { - CallFunc("init", m, targets); - } else if (executor_ == ExecutorType::Aot) { - CallFunc("init", m, targets, target_host_); - } else { - LOG(FATAL) << "Executor not supported"; - } - } +struct ExecutorCodegen { + void Init(runtime::Module* m, TargetsMap targets) { CallFunc("init", m, targets); } void Codegen(const Function& func) { CallFunc("codegen", func); } - std::string GetJSON() { - if (executor_ == ExecutorType::Graph) { - return CallFunc("get_graph_json", nullptr); - } else { - return ""; - } - } - - tir::PrimFunc GetRunnerFunction() { - if (executor_ == ExecutorType::Aot) { - return CallFunc("get_runner_function"); - } else { - return tir::PrimFunc(); - } - } - - Array GetExternalModules() { - return CallFunc>("get_external_modules", nullptr); - } - - Map GetIRModule() { - return CallFunc>("get_irmodule", nullptr); - } - std::unordered_map GetParams() { std::unordered_map ret; auto names = CallFunc>("list_params_name", nullptr); @@ -139,18 +79,17 @@ struct GraphCodegen { return ret; } - runtime::AOTMetadata GetAOTMetadata() { - if (executor_ == ExecutorType::Aot) { - return CallFunc("get_aot_metadata"); - } else { - // Graph runtime does not need AOT metadata - return runtime::AOTMetadata(); - } + Array GetExternalModules() { + return CallFunc>("get_external_modules", nullptr); } + Map GetIRModule() { + return CallFunc>("get_irmodule", nullptr); + } + + runtime::Metadata GetMetadata() { return CallFunc("get_metadata"); } + protected: - ExecutorType executor_; - Target target_host_; tvm::runtime::Module mod; template R CallFunc(const std::string& name, Args... args) { @@ -165,6 +104,29 @@ struct GraphCodegen { } }; +struct AOTCodegen : public ExecutorCodegen { + AOTCodegen() { + auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); + mod = (*pf)(); + } + ~AOTCodegen() {} +}; + +/*! + * \brief GraphCodegen module wrapper + * + */ +struct GraphCodegen : public ExecutorCodegen { + public: + GraphCodegen() { + auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); + mod = (*pf)(); + } + + std::string GetJSON() { return CallFunc("get_graph_json", nullptr); } + ~GraphCodegen(); +}; + /*! 
* \brief Relay build module * @@ -181,9 +143,6 @@ class RelayBuildModule : public runtime::ModuleNode { if (name == "get_graph_json") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetJSON(); }); - } else if (name == "get_runner_function") { - return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetRunnerFunction(); }); } else if (name == "get_module") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); }); @@ -207,11 +166,11 @@ class RelayBuildModule : public runtime::ModuleNode { }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->graph_codegen_->GetIRModule(); + *rv = this->executor_codegen_->GetIRModule(); }); } else if (name == "get_external_modules") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - *rv = this->graph_codegen_->GetExternalModules(); + *rv = this->executor_codegen_->GetExternalModules(); }); } else if (name == "optimize") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -239,13 +198,6 @@ class RelayBuildModule : public runtime::ModuleNode { */ const std::string& GetJSON() { return ret_.graph_json; } - /*! - * \brief Get the GraphJSON for runtime - * - * \return const std::string graph_json - */ - const tir::PrimFunc& GetRunnerFunction() { return ret_.runner_function; } - /*! * \brief Get the Module object * @@ -545,20 +497,30 @@ class RelayBuildModule : public runtime::ModuleNode { auto func = Downcast(relay_module->Lookup("main")); // Generate code for the updated function. - graph_codegen_ = std::unique_ptr(new GraphCodegen(target_host)); - graph_codegen_->Init(nullptr, targets_); - graph_codegen_->Codegen(func); + const String executor_str = + target_host->GetAttr("executor").value_or(kTvmExecutorGraph); + if (executor_str == kTvmExecutorGraph) { + executor_codegen_ = std::unique_ptr(new GraphCodegen()); + } else { + executor_codegen_ = std::unique_ptr(new AOTCodegen()); + } - ret_.graph_json = graph_codegen_->GetJSON(); - ret_.runner_function = graph_codegen_->GetRunnerFunction(); - ret_.params = graph_codegen_->GetParams(); + executor_codegen_->Init(nullptr, targets_); + executor_codegen_->Codegen(func); + + if (executor_str == kTvmExecutorGraph) { + ret_.graph_json = reinterpret_cast(executor_codegen_.get())->GetJSON(); + } else { + ret_.graph_json = ""; + } + ret_.params = executor_codegen_->GetParams(); - auto lowered_funcs = graph_codegen_->GetIRModule(); + auto lowered_funcs = executor_codegen_->GetIRModule(); // Generate a placeholder function that attaches linked params as its arguments. 
if (target_host->GetAttr("link-params").value_or(Bool(false))) { CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; - auto param_ids = graph_codegen_->GetParamIds(); + auto param_ids = executor_codegen_->GetParamIds(); auto link_params = Map(); for (auto param : ret_.params) { link_params.Set(param.first, tir::LinkedParam(param_ids[param.first], param.second)); @@ -592,9 +554,9 @@ class RelayBuildModule : public runtime::ModuleNode { ret_.mod = tvm::build(lowered_funcs, target_host_); } - auto ext_mods = graph_codegen_->GetExternalModules(); + auto ext_mods = executor_codegen_->GetExternalModules(); ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, GetTargetHost(), - graph_codegen_->GetAOTMetadata()); + executor_codegen_->GetMetadata()); } private: @@ -612,7 +574,7 @@ class RelayBuildModule : public runtime::ModuleNode { } protected: - std::unique_ptr graph_codegen_; + std::unique_ptr executor_codegen_; /*! \brief target device */ TargetsMap targets_; /*! \brief target host device */ diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 16a07ec15a40..292962469de9 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -51,14 +51,6 @@ using GraphInputObjectPtr = std::shared_ptr; using GraphOpObjectPtr = std::shared_ptr; using TargetsMap = std::unordered_map; -/*! \brief Lowered outputs */ -struct LoweredOutput { - std::string graph_json; - Map lowered_funcs; - Array external_mods; - std::unordered_map> params; -}; - /*! \brief Node types */ enum GraphNodeType { kGraphNop, @@ -637,6 +629,9 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.external_mods; }); + } else if (name == "get_metadata") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.metadata; }); } else { return PackedFunc([](TVMArgs args, TVMRetValue* rv) {}); } diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 6908ca85f582..7322f3dde3f5 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -37,12 +37,24 @@ #include #include #include +#include #include +#include "../../runtime/meta_data.h" + namespace tvm { namespace relay { namespace backend { +/*! \brief Lowered outputs */ +struct LoweredOutput { + std::string graph_json; + Map lowered_funcs; + Array external_mods; + std::unordered_map> params; + runtime::Metadata metadata; +}; + /*! * \brief A helper to expand the params by adding the ones used in a given expression. 
*/ diff --git a/src/runtime/crt/aot_executor/aot_executor.c b/src/runtime/crt/aot_executor/aot_executor.c index 4f30c152f4af..3880493d1780 100644 --- a/src/runtime/crt/aot_executor/aot_executor.c +++ b/src/runtime/crt/aot_executor/aot_executor.c @@ -25,9 +25,8 @@ * \param context Context information to be passed through to operators * \return tvm_status_t containing success or errors from the model run */ -#include "tvm/runtime/crt/aot_executor.h" - -#include "tvm/runtime/c_runtime_api.h" +#include +#include tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** outputs) { static DLDevice fake_device = {kDLCPU, 0}; diff --git a/src/runtime/crt/common/aot_backend_api.c b/src/runtime/crt/common/aot_backend_api.c deleted file mode 100644 index 782ea89f0cb2..000000000000 --- a/src/runtime/crt/common/aot_backend_api.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// LINT_C_FILE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "crt_config.h" - -void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, - int dtype_bits_hint) { - tvm_crt_error_t err = kTvmErrorNoError; - void* ptr = 0; - DLDevice dev = {device_type, device_id}; - assert(nbytes > 0); - err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr); - CHECK_EQ(err, kTvmErrorNoError, - "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type, - device_id, nbytes, dtype_code_hint, dtype_bits_hint, err); - return ptr; -} - -int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - tvm_crt_error_t err = kTvmErrorNoError; - DLDevice dev = {device_type, device_id}; - err = TVMPlatformMemoryFree(ptr, dev); - return err; -} - -int TVMBackendParallelLaunch(FTVMParallelLambda flambda, void* cdata, int num_task) { - TVMParallelGroupEnv env; - env.num_task = 1; - flambda(0, &env, cdata); - return 0; -} diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index c624462ba68c..0b0c81169756 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -123,7 +123,8 @@ int testonly_reset_server(TVMValue* args, int* type_codes, int num_args, TVMValu int main(int argc, char** argv) { g_argv = argv; - int status = MemoryManagerCreate(&memory_manager, memory, sizeof(memory), 8 /* page_size_log2 */); + int status = + PageMemoryManagerCreate(&memory_manager, memory, sizeof(memory), 8 /* page_size_log2 */); if (status != 0) { fprintf(stderr, "error initiailizing memory manager\n"); return 2; diff --git a/include/tvm/runtime/crt/aot_executor.h b/src/runtime/crt/include/tvm/runtime/crt/internal/aot_executor/aot_executor.h similarity index 90% rename from 
include/tvm/runtime/crt/aot_executor.h rename to src/runtime/crt/include/tvm/runtime/crt/internal/aot_executor/aot_executor.h index 35b440d44ca3..e49ca9933116 100644 --- a/include/tvm/runtime/crt/aot_executor.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/aot_executor/aot_executor.h @@ -18,7 +18,6 @@ */ /*! - * \file include/tvm/runtime/crt/aot_executor.h * \brief TVM Executor for the Ahead-of-Time Runtime * * AOT models are described by the TVM model descriptor format @@ -47,8 +46,8 @@ * } */ -#ifndef TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ -#define TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ +#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_AOT_EXECUTOR_AOT_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_AOT_EXECUTOR_AOT_EXECUTOR_H_ #include #include @@ -81,4 +80,4 @@ tvm_crt_error_t tvm_runtime_run(const tvm_model_t* model, void** inputs, void** } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_AOT_EXECUTOR_H_ +#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_AOT_EXECUTOR_AOT_EXECUTOR_H_ diff --git a/src/runtime/crt/memory/page_allocator.c b/src/runtime/crt/memory/page_allocator.c index 18dd818602f9..c016fe2acbef 100644 --- a/src/runtime/crt/memory/page_allocator.c +++ b/src/runtime/crt/memory/page_allocator.c @@ -122,8 +122,8 @@ void MultiMap_Insert(struct MultiMap* map, uint32_t npage, Page* p) { * \param size The size of memory * \return The virtual address */ -tvm_crt_error_t MemoryManager_Allocate(MemoryManagerInterface* interface, size_t num_bytes, - DLDevice dev, void** out_ptr) { +tvm_crt_error_t PageMemoryManager_Allocate(MemoryManagerInterface* interface, size_t num_bytes, + DLDevice dev, void** out_ptr) { MemoryManager* mgr = (MemoryManager*)interface; *out_ptr = 0; @@ -169,8 +169,8 @@ tvm_crt_error_t MemoryManager_Allocate(MemoryManagerInterface* interface, size_t * \param num_bytes The size of memory now required. * \return kTvmErrorNoError on success. */ -tvm_crt_error_t MemoryManager_Realloc(MemoryManagerInterface* interface, void** ptr, - tvm_index_t num_bytes) { +tvm_crt_error_t PageMemoryManager_Realloc(MemoryManagerInterface* interface, void** ptr, + tvm_index_t num_bytes) { MemoryManager* mgr = (MemoryManager*)interface; uint8_t* data = *((uint8_t**)ptr); // NOLINT(*) @@ -258,7 +258,7 @@ tvm_crt_error_t MemoryManager_Realloc(MemoryManagerInterface* interface, void** * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ -tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLDevice dev) { +tvm_crt_error_t PageMemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLDevice dev) { MemoryManager* mgr = (MemoryManager*)interface; TLB* pmap = &(mgr->pmap); @@ -277,8 +277,9 @@ tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, return kTvmErrorNoError; } -tvm_crt_error_t MemoryManagerCreate(MemoryManagerInterface** interface, uint8_t* memory_pool, - size_t memory_pool_size_bytes, size_t page_size_bytes_log2) { +tvm_crt_error_t PageMemoryManagerCreate(MemoryManagerInterface** interface, uint8_t* memory_pool, + size_t memory_pool_size_bytes, + size_t page_size_bytes_log2) { memset(memory_pool, 0, sizeof(memory_pool_size_bytes)); // Allocate enough space for MAX_PAGES. 
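For reference, a minimal sketch of how a platform layer is expected to use the renamed PageMemoryManagerCreate entry point, mirroring the host CRT setup above. The pool size, the global names, and the platform_memory_init helper are illustrative assumptions, not part of this patch.

    #include <stddef.h>
    #include <stdint.h>
    #include <tvm/runtime/crt/error_codes.h>
    #include <tvm/runtime/crt/page_allocator.h>
    #include <tvm/runtime/crt/platform.h>

    static uint8_t g_memory_pool[64 * 1024];
    static MemoryManagerInterface* g_memory_manager = NULL;

    /* Called once at startup; 8 == log2 of the page size, i.e. 256-byte pages. */
    tvm_crt_error_t platform_memory_init(void) {
      return PageMemoryManagerCreate(&g_memory_manager, g_memory_pool, sizeof(g_memory_pool),
                                     8 /* page_size_bytes_log2 */);
    }

    /* The CRT platform hooks then forward to the page allocator interface. */
    tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
      return g_memory_manager->Allocate(g_memory_manager, num_bytes, dev, out_ptr);
    }

    tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
      return g_memory_manager->Free(g_memory_manager, ptr, dev);
    }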
@@ -291,14 +292,14 @@ tvm_crt_error_t MemoryManagerCreate(MemoryManagerInterface** interface, uint8_t* MemoryManager* manager = (MemoryManager*)metadata_cursor; *interface = &manager->interface; /* handle MemoryManager member functions */ - manager->interface.Allocate = MemoryManager_Allocate; + manager->interface.Allocate = PageMemoryManager_Allocate; // manager->Realloc = MemoryManager_Reallocate; - manager->interface.Free = MemoryManager_Free; + manager->interface.Free = PageMemoryManager_Free; metadata_cursor += sizeof(MemoryManager); - manager->interface.Allocate = MemoryManager_Allocate; - manager->interface.Free = MemoryManager_Free; + manager->interface.Allocate = PageMemoryManager_Allocate; + manager->interface.Free = PageMemoryManager_Free; manager->ptable.memory_pool = memory_pool; /* handle PageTable member functions */ diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index aa819ea2343c..6cb39187193b 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -41,9 +41,9 @@ namespace tvm { namespace runtime { /*! - * \brief Structure used by the AOT to fill the tvm_module_t structure + * \brief Structure that can be optionally used by the executor codegen */ -class AOTMetadataNode : public Object { +class MetadataNode : public Object { public: /*! \brief number of inputs of the main function */ int num_inputs = 1; @@ -51,24 +51,24 @@ class AOTMetadataNode : public Object { int num_outputs = 1; static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const char* _type_key = "AOTMetadataObj"; - TVM_DECLARE_FINAL_OBJECT_INFO(AOTMetadataNode, Object); + static constexpr const char* _type_key = "MetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); }; /*! - * \brief Managed reference to AOTMetadataNode. + * \brief Managed reference to MetadataNode. */ -class AOTMetadata : public ObjectRef { +class Metadata : public ObjectRef { public: - TVM_DLL AOTMetadata(int num_inputs, int num_outputs) { - auto n = make_object(); + TVM_DLL Metadata(int num_inputs, int num_outputs) { + auto n = make_object(); n->num_inputs = num_inputs; n->num_outputs = num_outputs; data_ = std::move(n); } - TVM_DEFINE_OBJECT_REF_METHODS(AOTMetadata, ObjectRef, AOTMetadataNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(AOTMetadataNode); + TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); }; /*! diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 55b445c34b4a..4fdcd1b09325 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -47,7 +47,7 @@ namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, tvm::runtime::Module target_module, const Array& ext_modules, Target target, - runtime::AOTMetadata aot_metadata) { + runtime::Metadata metadata) { // Here we split modules into two groups: // 1. Those modules which can be exported to C-runtime. These are DSO-exportable // (i.e. llvm or c) modules which return nothing from get_const_vars(). 
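For orientation, the runtime::Metadata threaded through CreateMetadataModule ends up in the C source emitted by CSourceCrtMetadataModuleNode::GenerateAOTDescriptor() (see source_module.cc further down). A rough sketch of that generated descriptor is shown below; the input/output counts and the tvm__run_func symbol name are illustrative (the real symbol is taken from runtime::symbol::tvm_run_func_prefix).

    #include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"
    #include "tvm/runtime/c_runtime_api.h"

    /* Entry point produced by the AOT codegen; the signature follows TVMBackendPackedCFunc. */
    TVM_DLL int32_t tvm__run_func(TVMValue* args, int* type_codes, int num_args,
                                  TVMValue* out_value, int* out_type_code, void* resource_handle);

    /* Descriptor consumed by tvm_runtime_run(); the counts come from runtime::Metadata. */
    const tvm_model_t network = {
        .run_func = &tvm__run_func,
        .num_input_tensors = 2,
        .num_output_tensors = 1,
    };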
@@ -115,7 +115,7 @@ runtime::Module CreateMetadataModule( if (target->kind->name == "c") { crt_exportable_modules.push_back(target_module); - target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target, aot_metadata); + target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target, metadata); } else if (target->kind->name == "llvm") { #ifdef TVM_LLVM_VERSION crt_exportable_modules.push_back(target_module); diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index b643459be4b6..1627b6003391 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -852,8 +852,11 @@ void CodeGenC::VisitStmt_(const AllocateNode* op) { int32_t constant_size = op->constant_allocation_size(); ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; const VarNode* buffer = op->buffer_var.as(); - std::string scope = alloc_storage_scope_.at(buffer); - PrintStorageScope(scope, stream); + auto it = alloc_storage_scope_.find(buffer); + if (it != alloc_storage_scope_.end()) { + std::string scope = alloc_storage_scope_.at(buffer); + PrintStorageScope(scope, stream); + } PrintType(op->dtype, stream); stream << ' ' << vid << '[' << constant_size << "];\n"; diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 91ffa9546367..8402ccd4737e 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -40,10 +40,8 @@ namespace codegen { CodeGenCHost::CodeGenCHost() { module_name_ = GetUniqueName("__tvm_module_ctx"); } -void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool is_aot_executor, - std::string target_str) { +void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_str) { emit_asserts_ = emit_asserts; - is_aot_executor_ = is_aot_executor; declared_globals_.clear(); decl_stream << "// tvm target: " << target_str << "\n"; decl_stream << "#define TVM_EXPORTS\n"; @@ -214,33 +212,45 @@ void CodeGenCHost::PrintGetFuncFromBackend(const std::string& func_name, this->stream << "}\n"; } -void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, PrimExpr values, - int num_args) { +void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, int num_args) { this->PrintIndent(); - std::string stack_value = "stack_value"; - if (const VarNode* stack_value_var = values.as()) { - stack_value = stack_value_var->name_hint; - } std::string ret_val = GetUniqueName("ret_val"); std::string ret_type_code = GetUniqueName("ret_type_code"); this->stream << "TVMValue " << ret_val << ";\n"; this->PrintIndent(); this->stream << "int " << ret_type_code << ";\n"; this->PrintIndent(); + this->stream << "if (TVMFuncCall(" << packed_func_name << ", " + << "(TVMValue*) stack_value" + << ", " + << "(int*) stack_tcode" + << ", " << num_args << ", " + << "&" << ret_val << ", " + << "&" << ret_type_code << ") != 0) {\n"; + int func_call_scope = this->BeginScope(); + this->PrintIndent(); + this->stream << "return -1;\n"; + this->EndScope(func_call_scope); + this->PrintIndent(); + this->stream << "}\n"; +} - if (is_aot_executor_) { - this->stream << "if (" << packed_func_name << "( " - << "(TVMValue*) " << stack_value; - } else { - this->stream << "if (TVMFuncCall(" << packed_func_name << ", " - << "(TVMValue*) stack_value"; - } - this->stream << ", " +void CodeGenCHost::PrintFuncCallC(const std::string& packed_func_name, int num_args) { + this->PrintIndent(); + std::string ret_val = GetUniqueName("ret_val"); + std::string 
ret_type_code = GetUniqueName("ret_type_code"); + this->stream << "TVMValue " << ret_val << ";\n"; + this->PrintIndent(); + this->stream << "int " << ret_type_code << ";\n"; + this->PrintIndent(); + + this->stream << "if (" << packed_func_name << "( " + << "(TVMValue*) stack_value " + << ", " << "(int*) stack_tcode" << ", " << num_args << ", " - << "&" << ret_val << ", "; - this->stream << "&" << ret_type_code; - this->stream << (is_aot_executor_ ? ", NULL" : "") << ") != 0) {\n"; + << "&" << ret_val << ", " + << "&" << ret_type_code << ", NULL) != 0){\n"; int func_call_scope = this->BeginScope(); this->PrintIndent(); @@ -250,6 +260,29 @@ void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, PrimExpr v this->stream << "}\n"; } +CodeGenCHost::FunctionInfo CodeGenCHost::GetFunctionInfo(const CallNode* op) { + const StringImmNode* s = op->args[0].as(); + ICHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name"; + int64_t begin = op->args[3].as()->value; + int64_t end = op->args[4].as()->value; + int64_t num_args = end - begin; + ICHECK_GE(num_args, 0); + std::string func_name = s->value; + // NOTE: cannot rely on GetUnique for global decl_stream declarations + // because it is reset between AddFunction(). + std::string packed_func_name = func_name + "_packed"; + std::string unique_name; + auto it = declared_globals_.find(packed_func_name); + if (it != declared_globals_.end()) { + unique_name = it->second; + } else { + unique_name = GetUniqueName(packed_func_name); + declared_globals_[packed_func_name] = unique_name; + decl_stream << "static void* " << unique_name << " = NULL;\n"; + } + return {func_name, unique_name, num_args}; +} + void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) if (op->op.same_as(builtin::tvm_stack_alloca())) { std::string stack_name = GetUniqueName("stack"); @@ -274,30 +307,12 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT this->stream << "TVMValue " << stack_name << "[" << size << "];\n"; os << stack_name; } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { - const StringImmNode* s = op->args[0].as(); - ICHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name"; - int64_t begin = op->args[3].as()->value; - int64_t end = op->args[4].as()->value; - int64_t num_args = end - begin; - ICHECK_GE(num_args, 0); - std::string func_name = s->value; - // NOTE: cannot rely on GetUnique for global decl_stream declarations - // because it is reset between AddFunction(). 
- std::string packed_func_name = func_name + "_packed"; - std::string unique_name; - auto it = declared_globals_.find(packed_func_name); - if (it != declared_globals_.end()) { - unique_name = it->second; - } else { - unique_name = GetUniqueName(packed_func_name); - declared_globals_[packed_func_name] = unique_name; - decl_stream << "static void* " << unique_name << " = NULL;\n"; - } - if (!is_aot_executor_) { - this->PrintGetFuncFromBackend(func_name, unique_name); - } - this->PrintFuncCall(unique_name, num_args); - + auto function_info = GetFunctionInfo(op); + this->PrintGetFuncFromBackend(function_info.func_name, function_info.func_name_packed); + this->PrintFuncCall(function_info.func_name, function_info.num_args); + } else if (op->op.same_as(builtin::tvm_call_cpacked_lowered())) { + auto function_info = GetFunctionInfo(op); + this->PrintFuncCallC(function_info.func_name, function_info.num_args); } else if (op->op.same_as(builtin::tvm_throw_last_error())) { this->PrintIndent(); this->stream << "return -1;\n"; @@ -346,14 +361,11 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, const char* compare, } runtime::Module BuildCHost(IRModule mod, Target target) { - bool is_aot_executor = - (target->GetAttr("executor").value_or(kTvmExecutorGraph) == kTvmExecutorAot); - using tvm::runtime::Registry; bool output_ssa = false; bool emit_asserts = false; CodeGenCHost cg; - cg.Init(output_ssa, emit_asserts, is_aot_executor, target->str()); + cg.Init(output_ssa, emit_asserts, target->str()); Map linked_params; bool found_linked_params = false; @@ -373,14 +385,13 @@ runtime::Module BuildCHost(IRModule mod, Target target) { } // Make sure that the executor function is the last one to be code generated so that all the // symbols are available to tvm_run_func - if (is_aot_executor) { - auto fun_name = std::string(kv.first->name_hint); - const bool is_aot_executor_fn = - (fun_name.rfind(::tvm::runtime::symbol::tvm_run_func_prefix, 0) == 0); - if (is_aot_executor_fn) { - aot_executor_fn = Downcast(kv.second); - continue; - } + auto fun_name = std::string(kv.first->name_hint); + const bool is_aot_executor_fn = + (fun_name.rfind(::tvm::runtime::symbol::tvm_run_func_prefix, 0) == 0); + + if (is_aot_executor_fn) { + aot_executor_fn = Downcast(kv.second); + continue; } ICHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; @@ -393,10 +404,7 @@ runtime::Module BuildCHost(IRModule mod, Target target) { cg.LinkParameters(linked_params); } - if (is_aot_executor) { - ICHECK(aot_executor_fn.defined()) - << "When using aot executor the executor function " - << ::tvm::runtime::symbol::tvm_lookup_linked_param << " should be defined"; + if (aot_executor_fn.defined()) { cg.AddFunction(aot_executor_fn); } diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index caf17ff832db..2ee31b8c7e0e 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -38,7 +38,7 @@ namespace codegen { class CodeGenCHost final : public CodeGenC { public: CodeGenCHost(); - void Init(bool output_ssa, bool emit_asserts, bool is_aot_executor, std::string target_str); + void Init(bool output_ssa, bool emit_asserts, std::string target_str); void AddFunction(const PrimFunc& f); @@ -62,6 +62,15 @@ class CodeGenCHost final : public CodeGenC { Array GetFunctionNames() { return function_names_; } private: + /* \brief Internal structure to store information about function calls */ + struct FunctionInfo { + /* \brief function name */ + std::string func_name; + 
/* packed name of the function */ + std::string func_name_packed; + /* number of arguments required by the function */ + int64_t num_args; + }; std::string module_name_; /* \brief mapping global packed func to the unique name */ std::unordered_map declared_globals_; @@ -69,10 +78,11 @@ class CodeGenCHost final : public CodeGenC { Array function_names_; /*! \brief whether to emit asserts in the resulting C code */ bool emit_asserts_; - bool is_aot_executor_; + FunctionInfo GetFunctionInfo(const CallNode* op); void PrintGetFuncFromBackend(const std::string& func_name, const std::string& packed_func_name); - void PrintFuncCall(const std::string& packed_func_name, PrimExpr values, int num_args); + void PrintFuncCall(const std::string& packed_func_name, int num_args); + void PrintFuncCallC(const std::string& packed_func_name, int num_args); /*! * \brief Print ternary conditional operator implementing binary `op` diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 32377df41f12..80fc9b486971 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -157,7 +157,7 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, runtime::Module CreateMetadataModule( const std::unordered_map& params, runtime::Module target_module, const Array& ext_modules, Target target, - runtime::AOTMetadata aot_metadata = runtime::AOTMetadata()); + runtime::Metadata metadata = runtime::Metadata()); /*! * \brief Create a source module for viewing and limited saving for device. @@ -175,11 +175,11 @@ runtime::Module DeviceSourceModuleCreate( * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime. * \param modules The modules to be wrapped. * \param target the target the modules are compiled for. - * \param aot_metadata the metadata needed for aot code generation. + * \param metadata the metadata needed for code generation. * \return The wrapped module. 
*/ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - runtime::AOTMetadata aot_metadata); + runtime::Metadata metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index bf24692c3484..ccf520840f47 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -130,8 +130,8 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, - Target target, runtime::AOTMetadata aot_metadata) - : fmt_(fmt), func_names_(func_names), target_(target), aot_metadata_(aot_metadata) { + Target target, runtime::Metadata metadata) + : fmt_(fmt), func_names_(func_names), target_(target), metadata_(metadata) { CreateSource(); } const char* type_key() const { return "c"; } @@ -159,7 +159,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::string fmt_; Array func_names_; Target target_; - runtime::AOTMetadata aot_metadata_; + runtime::Metadata metadata_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -193,7 +193,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } void GenerateAOTDescriptor() { - code_ << "#include \"aot_executor.h\"\n"; + code_ << "#include \"tvm/runtime/crt/internal/aot_executor/aot_executor.h\"\n"; code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n"; code_ << "#ifdef __cplusplus\n"; code_ << "extern \"C\"\n"; @@ -203,8 +203,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { "out_type_code, void* resource_handle);\n"; code_ << "const tvm_model_t network = {\n" << " .run_func = &" << ::tvm::runtime::symbol::tvm_run_func_prefix << ",\n" - << " .num_input_tensors = " << aot_metadata_->num_inputs << ",\n" - << " .num_output_tensors = " << aot_metadata_->num_outputs << ", \n" + << " .num_input_tensors = " << metadata_->num_inputs << ",\n" + << " .num_output_tensors = " << metadata_->num_outputs << ", \n" << "};\n"; } @@ -213,7 +213,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { CreateFuncRegistry(); GenerateCrtSystemLib(); } - if (target_->GetAttr("executor").value_or("graph_runtime") == "aot") { + if (target_->GetAttr("executor").value_or(kTvmExecutorGraph) == kTvmExecutorAot) { GenerateAOTDescriptor(); } code_ << ";"; @@ -221,7 +221,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { }; runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - runtime::AOTMetadata aot_metadata) { + runtime::Metadata metadata) { Array func_names; for (runtime::Module mod : modules) { auto pf_funcs = mod.GetFunction("get_func_names"); @@ -232,7 +232,7 @@ runtime::Module CreateCSourceCrtMetadataModule(const Array& mod } } } - auto n = make_object(func_names, "cc", target, aot_metadata); + auto n = make_object(func_names, "cc", target, metadata); auto csrc_metadata_module = runtime::Module(n); for (const auto& mod : modules) { csrc_metadata_module.Import(mod); @@ -304,7 +304,7 @@ TVM_REGISTER_GLOBAL("runtime.CSourceModuleCreate") TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule") .set_body_typed([](const Array& modules, Target target) { // Note that we don't need metadata when we compile a single operator - return CreateCSourceCrtMetadataModule(modules, target, runtime::AOTMetadata()); + return CreateCSourceCrtMetadataModule(modules, target, 
runtime::Metadata()); }); } // namespace codegen diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index f4f52eccd1dd..6226ba2f22b3 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -40,8 +40,7 @@ namespace codegen { * \param target TVM target. */ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, - tvm::Target target, - runtime::AOTMetadata aot_metadata); + tvm::Target target, runtime::Metadata metadata); } // namespace codegen } // namespace tvm diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index c4d76d4a7494..f3ab78f89bec 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -174,7 +174,7 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_stack_make_array) TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(tvm_call_unpacked) +TIR_DEFINE_BUILTIN_FUNC(tvm_call_cpacked) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(tvm_call_trace_packed) @@ -187,6 +187,9 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_thread_context) TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed_lowered) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(tvm_call_cpacked_lowered) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + TIR_DEFINE_BUILTIN_FUNC(tvm_call_trace_packed_lowered) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index ef50dae82ce0..db5af9412d95 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -179,7 +179,9 @@ class BuiltinLower : public StmtExprMutator { } PrimExpr VisitExpr_(const CallNode* op) final { if (op->op.same_as(builtin::tvm_call_packed())) { - return MakeCallPacked(op); + return MakeCallPacked(op, true); + } else if (op->op.same_as(builtin::tvm_call_cpacked())) { + return MakeCallPacked(op, false); } else if (op->op.same_as(builtin::tvm_call_trace_packed())) { return MakeCallTracePacked(op); } else if (op->op.same_as(builtin::tvm_stack_make_shape())) { @@ -256,7 +258,7 @@ class BuiltinLower : public StmtExprMutator { return TVMStructGet(DataType::Handle(), scope.stack_array, idx, builtin::kArrAddr); } // call packed. 
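The host C generated for the two lowered intrinsics differs only in how the callee is reached; a sketch of both call sequences follows (the fused_op symbol, the argument count, and the local names are placeholders for illustration, not output of this patch).

    #include <tvm/runtime/c_backend_api.h>
    #include <tvm/runtime/c_runtime_api.h>

    extern void* __tvm_module_ctx;
    extern int32_t fused_op(TVMValue* args, int* type_codes, int num_args, TVMValue* out_value,
                            int* out_type_code, void* resource_handle);

    /* tvm_call_packed_lowered: resolve the PackedFunc by name, then go through TVMFuncCall. */
    static int32_t call_via_packed(TVMValue* stack_value, int* stack_tcode) {
      static void* fused_op_packed = NULL;
      TVMValue ret_val;
      int ret_type_code;
      if (fused_op_packed == NULL &&
          TVMBackendGetFuncFromEnv(__tvm_module_ctx, "fused_op", &fused_op_packed) != 0) {
        return -1;
      }
      return TVMFuncCall(fused_op_packed, stack_value, stack_tcode, 2, &ret_val, &ret_type_code);
    }

    /* tvm_call_cpacked_lowered: call the symbol directly, passing NULL as resource_handle;
     * no string lookup or function registry is needed, which is what lets the AOT runner
     * link its operators statically. */
    static int32_t call_via_cpacked(TVMValue* stack_value, int* stack_tcode) {
      TVMValue ret_val;
      int ret_type_code;
      return fused_op(stack_value, stack_tcode, 2, &ret_val, &ret_type_code, NULL);
    }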
- PrimExpr MakeCallPacked(const CallNode* op) { + PrimExpr MakeCallPacked(const CallNode* op, bool use_string_lookup) { auto& scope = alloca_scope_.back(); auto& prep_seq = prep_seq_stack_.back(); @@ -298,8 +300,11 @@ class BuiltinLower : public StmtExprMutator { ConstInt32(arg_stack_begin), ConstInt32(arg_stack_begin + op->args.size() - 1)}; - // call_packed_lowered needs to do the type casting properly - return Call(op->dtype, builtin::tvm_call_packed_lowered(), packed_args); + if (use_string_lookup) { + return Call(op->dtype, builtin::tvm_call_packed_lowered(), packed_args); + } else { + return Call(op->dtype, builtin::tvm_call_cpacked_lowered(), packed_args); + } } PrimExpr MakeCallTracePacked(const CallNode* op) { diff --git a/tests/crt/aot_executor_test.cc b/tests/crt/aot_executor_test.cc index aa18f2b22b7f..ded6729d138b 100644 --- a/tests/crt/aot_executor_test.cc +++ b/tests/crt/aot_executor_test.cc @@ -20,7 +20,7 @@ #include #include #include -#include +#include int test_run_func(TVMValue* args, int* arg_type_ids, int num_args, TVMValue* out_ret_value, int* out_ret_tcode, void* resource_handle) { diff --git a/tests/crt/memory_test.cc b/tests/crt/memory_test.cc index b11ab774f101..b531383058e6 100644 --- a/tests/crt/memory_test.cc +++ b/tests/crt/memory_test.cc @@ -37,7 +37,7 @@ class MemoryManagerTest : public ::testing::Test { void SetUp() override { memset(raw_memory_pool, 0, sizeof(raw_memory_pool)); memory_pool = (uint8_t*)(ROUND_UP(((uintptr_t)raw_memory_pool), (1 << kPageSizeBytesLog))); - MemoryManagerCreate(&interface, memory_pool, kMemoryPoolSizeBytes, kPageSizeBytesLog); + PageMemoryManagerCreate(&interface, memory_pool, kMemoryPoolSizeBytes, kPageSizeBytesLog); mgr = (MemoryManager*)interface; ASSERT_EQ(kNumUsablePages, mgr->ptable.max_pages); dev_ = {kDLCPU, 0}; diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index d65fd1221c20..6e996c99f49b 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -29,7 +29,7 @@ CC_OPTS = CC=$(CC) AR=$(AR) RANLIB=$(RANLIB) PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ - -I$(TVM_ROOT)/include/tvm/runtime/crt \ + -I$(TVM_ROOT)/src/runtime/crt/include \ -I$(TVM_ROOT)/src/runtime/crt/host \ -I$(TVM_ROOT)/include \ -I$(DMLC_CORE)/include \ @@ -47,7 +47,7 @@ CRT_SRCS = $(shell find $(CRT_ROOT)) aot_test_runner: $(build_dir)/aot_test_runner -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/tvm_executor.o $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/aot_executor.o $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm @@ -59,7 +59,7 @@ $(build_dir)/lib0.o: $(build_dir)/../codegen/host/src/lib0.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) -$(build_dir)/tvm_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_executor.c +$(build_dir)/aot_executor.o: $(TVM_ROOT)/src/runtime/crt/aot_executor/aot_executor.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) @@ -67,7 +67,7 @@ $(build_dir)/stack_allocator.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_allocat $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) -$(build_dir)/crt_backend_api.o: 
$(TVM_ROOT)/src/runtime/crt/common/aot_backend_api.c +$(build_dir)/crt_backend_api.o: $(TVM_ROOT)/src/runtime/crt/common/crt_backend_api.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) @@ -77,4 +77,3 @@ cleanall: $(QUIET)rm -rf $(build_dir) # Don't define implicit rules; they tend to match on logical target names that aren't targets (i.e. bundle_static) .SUFFIXES: -.DEFAULT: ethosu_test_runner diff --git a/tests/python/relay/aot/infra.py b/tests/python/relay/aot/aot_test_utils.py similarity index 92% rename from tests/python/relay/aot/infra.py rename to tests/python/relay/aot/aot_test_utils.py index 15c2775abcfc..fe97844e5d15 100644 --- a/tests/python/relay/aot/infra.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -14,15 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -This module provides infrastructure to verify the correctness of -the command stream produced. -Currently it will invoke vela to generate a vela-optimized tflite -in which the command stream is contained as a custom operator. -This class include methods to parse the custom operator to extract -the command stream and perform an equivalency check for single operator -test cases. -""" + import tflite import os import io @@ -38,11 +30,9 @@ import tvm from tvm import relay from tvm.relay import transform -from tvm.relay.op.contrib import get_pattern_table from tvm.contrib import utils, graph_executor from tvm.relay.backend import compile_engine from tvm.contrib import utils -from tvm.contrib import graph_runtime from tvm.micro import export_model_library_format @@ -72,8 +62,8 @@ def create_main(test_name, input_list, output_list, output_path): raw_path = file_path.with_suffix(".c").resolve() with open(raw_path, "w") as main_file: main_file.write("#include \n") - main_file.write('#include "aot_executor.h"\n') - main_file.write('#include "stack_allocator.h"\n') + main_file.write('#include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"\n') + main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n') main_file.write("#define WORKSPACE_SIZE (16384*1024)\n") main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") @@ -98,7 +88,9 @@ def create_main(test_name, input_list, output_list, output_path): void TVMPlatformAbort(tvm_crt_error_t code) { } void TVMLogf(const char* msg, ...) 
{ } - + +TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {} + """ ) main_file.write("int main(){\n") @@ -158,7 +150,7 @@ def create_header_file(tensor_name, npy_data, output_path): header_file.write("};\n\n") -def verify_source(mod, input_list, output_list, params=None): +def compile_and_run(mod, input_list, output_list, params=None): """ This method verifies the generated source """ diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 9d2d41450625..f125a3e77e7b 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -38,7 +38,7 @@ from tvm.micro import export_model_library_format from tvm.relay import testing -from infra import * +from aot_test_utils import * def test_conv_with_params(): @@ -70,7 +70,7 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), output_list = generate_ref_data(mod, inputs, params) input_list = [input_data] - verify_source(mod, input_list, output_list, params) + compile_and_run(mod, input_list, output_list, params) def test_add_with_params(): @@ -87,7 +87,7 @@ def test_add_with_params(): output_list = generate_ref_data(func, inputs, params) input_list = [y_in] - verify_source(func, input_list, output_list, params) + compile_and_run(func, input_list, output_list, params) def test_conv2d(): @@ -132,7 +132,7 @@ def group_conv2d(): for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]: output_list = generate_ref_data(mod, inputs) input_list = [inputs["data"], inputs["weight"]] - verify_source(mod, input_list, output_list) + compile_and_run(mod, input_list, output_list) def test_concatenate(): @@ -151,7 +151,7 @@ def test_concatenate(): output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"], inputs["z"]] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_nested_tuples(): @@ -167,14 +167,14 @@ def test_nested_tuples(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [x_data] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_tuple_getitem(): func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0)) output_list = generate_ref_data(func, {}) input_list = [] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_id(): @@ -184,7 +184,7 @@ def test_id(): inputs = {"x": one} output_list = generate_ref_data(ident, inputs) input_list = [one] - verify_source(ident, input_list, output_list) + compile_and_run(ident, input_list, output_list) def test_add_const(): @@ -192,7 +192,7 @@ def test_add_const(): func = relay.Function([], two) output_list = generate_ref_data(func, {}) input_list = [] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_mul_param(): @@ -204,7 +204,7 @@ def test_mul_param(): inputs = {"x": x_data, "y": y_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"], inputs["y"]] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_subtract(): @@ -215,7 +215,7 @@ def test_subtract(): inputs = {"i": i_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["i"]] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_tuple_output(): @@ -230,7 +230,7 @@ def 
test_tuple_output(): inputs = {"x": x_data} output_list = generate_ref_data(func, inputs) input_list = [inputs["x"]] - verify_source(func, input_list, output_list) + compile_and_run(func, input_list, output_list) def test_mobilenet(): @@ -240,7 +240,7 @@ def test_mobilenet(): inputs = {"data": data} output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["data"]] - verify_source(mod, input_list, output_list, params) + compile_and_run(mod, input_list, output_list, params) if __name__ == "__main__": diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index ccf48e077511..54cd31f6fb16 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -133,18 +133,22 @@ def test_plan_memory(): smap = relay.backend._backend.GraphPlanMemory(func) storage_ids = set() device_types = set() + storage_sizes = set() for k, v in smap.items(): assert len(v) == 3 for x in v[0]: storage_ids.add(x.value) for x in v[1]: device_types.add(x.value) + for x in v[2]: + storage_sizes.add(x.value) # Current rule requires vars have unique storage id # because we don't do inplace, we will need another # two alternating temporary space. assert len(storage_ids) == 4 assert len(device_types) == 1 + assert len(storage_sizes) == 4 def test_reshape_nop(): diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 8fa91b97a714..c6902429c0cd 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -157,8 +157,8 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { factory = tvm.relay.build(relay_mod, target=TARGET) with _make_session(workspace, factory.get_lib()) as sess: - graph_mod = tvm.micro.create_local_graph_runtime( - factory.get_graph_json(), sess.get_system_lib(), sess.context + graph_mod = tvm.micro.create_local_graph_executor( + factory.get_graph_json(), sess.get_system_lib(), sess.device ) A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() @@ -226,4 +226,5 @@ def test_platform_timer(): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + test_graph_executor() +# sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 8305f002a6a3..3ad515604d0b 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -199,7 +199,7 @@ def test_llvm_link_params(): assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded assert mod.get_function("TVMSystemLibEntryPoint") != None - graph = json.loads(lib.graph) + graph = json.loads(lib.graph_json) for p in lib.params: _verify_linked_param(dtype, lib, mod, graph, p) or found_one @@ -310,7 +310,7 @@ def test_c_link_params(): lib_mod = tvm.runtime.load_module(lib_path) # lib_mod = lib_factory['default']() - graph = json.loads(lib.graph) + graph = json.loads(lib.graph_json) for p in lib.params: _verify_linked_param(dtype, lib, lib_mod, graph, p) diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index 642a521fe620..db6c55bca12a 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -35,7 +35,7 @@ def validate_graph_json(extract_dir, factory): with 
open(os.path.join(extract_dir, "runtime-config", "graph", "graph.json")) as graph_f: graph_json = graph_f.read() - assert graph_json == factory.graph + assert graph_json == factory.graph_json # Just check it parses and looks roughly right. graph = json.loads(graph_json) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 6bca45a38ea0..1d80c60de790 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -539,11 +539,11 @@ def test_debug_graph_executor(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # debug graph runtime wrapper - debug_g_mod = debug_runtime.GraphModuleDebug( - complied_graph_lib["debug_create"]("default", ctx), - [ctx], - complied_graph_lib.get_graph_json(), + # debug graph executor wrapper + debug_g_mod = debug_executor.GraphModuleDebug( + complied_graph_lib["debug_create"]("default", dev), + [dev], + complied_graph_lib.get_json(), None, ) debug_g_mod.set_input("data", data) From a236c8d845167823289ade299876cb9362d1d954 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 20 Apr 2021 19:09:41 +0100 Subject: [PATCH 10/33] Addressing comments - 4 Change-Id: Ibe29676abe3b75161b5a0903e007118a8318d862 --- src/relay/backend/aot_executor_codegen.cc | 23 ++-- src/relay/backend/build_module.cc | 46 +++++--- tests/python/relay/aot/aot_test.mk | 11 +- tests/python/relay/aot/test_crt_aot.py | 103 ++++++++++++++++++ .../test_micro_model_library_format.py | 4 +- 5 files changed, 151 insertions(+), 36 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index c514ecb24fef..f9109a7924f2 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -93,7 +93,7 @@ class AotReturnSidVisitor : public ExprVisitor { }; /*! \brief Code generator for AOT executor */ -class AOTCodegen : public ExprVisitor { +class AOTExecutorCodegen : public ExprVisitor { protected: /*! 
* \brief Utility function to allocate a DLTensor or TVMValue @@ -309,6 +309,7 @@ class AOTCodegen : public ExprVisitor { // Generate the TIR function call CreateFuncCall(GetRef(op), ext_func->func_name); + return; } ICHECK_GE(storage_device_map_.count(expr), 0); @@ -498,7 +499,7 @@ class AOTCodegen : public ExprVisitor { IntegerArray return_sid_; public: - AOTCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) + AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) : mod_(mod), return_sid_() { compile_engine_ = CompileEngine::Global(); targets_ = targets; @@ -567,9 +568,9 @@ class AOTCodegen : public ExprVisitor { } }; -class AOTCodegenModule : public runtime::ModuleNode { +class AOTExecutorCodegenModule : public runtime::ModuleNode { public: - AOTCodegenModule() {} + AOTExecutorCodegenModule() {} virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -625,8 +626,8 @@ class AOTCodegenModule : public runtime::ModuleNode { ICHECK(dev_type); targets[dev_type->value] = it.second; } - codegen_ = - std::make_shared(reinterpret_cast(mod), targets, target_host); + codegen_ = std::make_shared(reinterpret_cast(mod), + targets, target_host); } LoweredOutput codegen(Function func) { return this->codegen_->Codegen(func); } @@ -655,17 +656,17 @@ class AOTCodegenModule : public runtime::ModuleNode { Map get_irmodule() { return this->output_.lowered_funcs; } - std::shared_ptr codegen_; + std::shared_ptr codegen_; LoweredOutput output_; }; -runtime::Module CreateAOTCodegenMod() { - auto ptr = make_object(); +runtime::Module CreateAOTExecutorCodegenMod() { + auto ptr = make_object(); return runtime::Module(ptr); } -TVM_REGISTER_GLOBAL("relay.build_module._GraphAOTCodegen") - .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CreateAOTCodegenMod(); }); +TVM_REGISTER_GLOBAL("relay.build_module._AOTExecutorCodegen") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CreateAOTExecutorCodegenMod(); }); } // namespace backend } // namespace relay diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 57bd48102ae1..25c676362e19 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -57,6 +57,8 @@ struct ExecutorCodegen { void Codegen(const Function& func) { CallFunc("codegen", func); } + virtual void UpdateOutput(BuildOutput* ret) = 0; + std::unordered_map GetParams() { std::unordered_map ret; auto names = CallFunc>("list_params_name", nullptr); @@ -88,6 +90,7 @@ struct ExecutorCodegen { } runtime::Metadata GetMetadata() { return CallFunc("get_metadata"); } + virtual ~ExecutorCodegen() {} protected: tvm::runtime::Module mod; @@ -104,11 +107,14 @@ struct ExecutorCodegen { } }; -struct AOTCodegen : public ExecutorCodegen { +struct AOTCodegen : ExecutorCodegen { AOTCodegen() { - auto pf = GetPackedFunc("relay.build_module._GraphAOTCodegen"); + auto pf = GetPackedFunc("relay.build_module._AOTExecutorCodegen"); mod = (*pf)(); } + + void UpdateOutput(BuildOutput* ret) override { ret->graph_json = ""; } + ~AOTCodegen() {} }; @@ -116,17 +122,33 @@ struct AOTCodegen : public ExecutorCodegen { * \brief GraphCodegen module wrapper * */ -struct GraphCodegen : public ExecutorCodegen { - public: +struct GraphCodegen : ExecutorCodegen { GraphCodegen() { auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); mod = (*pf)(); } + void UpdateOutput(BuildOutput* 
ret) override { ret->graph_json = GetJSON(); } std::string GetJSON() { return CallFunc("get_graph_json", nullptr); } - ~GraphCodegen(); + + ~GraphCodegen() {} }; +/*! + * \brief Executor codegen factory function + */ +std::unique_ptr MakeExecutorCodegen(String executor_str) { + std::unique_ptr ret; + if (executor_str == kTvmExecutorGraph) { + ret = std::make_unique(); + } else if (executor_str == kTvmExecutorAot) { + ret = std::make_unique(); + } else { + CHECK(false) << "Executor " << executor_str << " not supported"; + } + return ret; +} + /*! * \brief Relay build module * @@ -499,20 +521,10 @@ class RelayBuildModule : public runtime::ModuleNode { // Generate code for the updated function. const String executor_str = target_host->GetAttr("executor").value_or(kTvmExecutorGraph); - if (executor_str == kTvmExecutorGraph) { - executor_codegen_ = std::unique_ptr(new GraphCodegen()); - } else { - executor_codegen_ = std::unique_ptr(new AOTCodegen()); - } - + executor_codegen_ = MakeExecutorCodegen(executor_str); executor_codegen_->Init(nullptr, targets_); executor_codegen_->Codegen(func); - - if (executor_str == kTvmExecutorGraph) { - ret_.graph_json = reinterpret_cast(executor_codegen_.get())->GetJSON(); - } else { - ret_.graph_json = ""; - } + executor_codegen_->UpdateOutput(&ret_); ret_.params = executor_codegen_->GetParams(); auto lowered_funcs = executor_codegen_->GetIRModule(); diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk index 6e996c99f49b..ae8389561459 100644 --- a/tests/python/relay/aot/aot_test.mk +++ b/tests/python/relay/aot/aot_test.mk @@ -47,15 +47,14 @@ CRT_SRCS = $(shell find $(CRT_ROOT)) aot_test_runner: $(build_dir)/aot_test_runner -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/lib0.o $(build_dir)/lib1.o $(build_dir)/aot_executor.o $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o - $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm +source_libs= $(wildcard $(build_dir)/../codegen/host/src/lib*.c) +lib_objs =$(source_libs:.c=.o) -$(build_dir)/lib1.o: $(build_dir)/../codegen/host/src/lib1.c +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/aot_executor.o $(source_libs) $(build_dir)/stack_allocator.o $(build_dir)/crt_backend_api.o $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) + $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -lm -$(build_dir)/lib0.o: $(build_dir)/../codegen/host/src/lib0.c +$(build_dir)/%.o: $(build_dir)/../codegen/host/src/%.c $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) -c $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_CFLAGS) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index f125a3e77e7b..f4f46ab67479 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -37,6 +37,9 @@ from tvm.contrib import graph_executor from tvm.micro import export_model_library_format from tvm.relay import testing +from tvm.relay.op.annotation import compiler_begin, compiler_end +from tvm.contrib import utils +from tvm.relay.expr_functor import ExprMutator from aot_test_utils import * @@ -243,5 +246,105 @@ def test_mobilenet(): compile_and_run(mod, input_list, output_list, params) +class CcompilerAnnotator(ExprMutator): + """ + This is used to create external functions for ccompiler. 
+ A simple annotator that creates the following program: + | + -- begin -- + | + add + | + subtract + | + multiply + | + -- end -- + | + """ + + def __init__(self): + super(CcompilerAnnotator, self).__init__() + self.in_compiler = 0 + + def visit_call(self, call): + if call.op.name == "add": # Annotate begin at args + if self.in_compiler == 1: + lhs = compiler_begin(super().visit(call.args[0]), "ccompiler") + rhs = compiler_begin(super().visit(call.args[1]), "ccompiler") + op = relay.add(lhs, rhs) + self.in_compiler = 2 + return op + elif call.op.name == "subtract": + if self.in_compiler == 1: + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + return relay.subtract(lhs, rhs) + elif call.op.name == "multiply": # Annotate end at output + self.in_compiler = 1 + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + op = relay.multiply(lhs, rhs) + if self.in_compiler == 2: + op = compiler_end(op, "ccompiler") + self.in_compiler = 0 + return op + return super().visit_call(call) + + +def test_byoc_utvm(): + """This is a simple test case to check BYOC capabilities of AOT""" + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + w2 = relay.var("w2", shape=(10, 10)) + w3 = relay.var("w3", shape=(10, 10)) + w4 = relay.var("w4", shape=(10, 10)) + w5 = relay.var("w5", shape=(10, 10)) + w6 = relay.var("w6", shape=(10, 10)) + w7 = relay.var("w7", shape=(10, 10)) + + # C compiler + z0 = relay.add(x, w0) + p0 = relay.subtract(z0, w1) + q0 = relay.multiply(p0, w2) + + z1 = relay.add(x, w3) + p1 = relay.subtract(z1, w4) + q1 = relay.multiply(p1, w5) + + # Other parts on TVM + z2 = relay.add(x, w6) + q2 = relay.subtract(z2, w7) + + r = relay.concatenate((q0, q1, q2), axis=0) + f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r) + mod = tvm.IRModule() + ann = CcompilerAnnotator() + mod["main"] = ann.visit(f) + mod = tvm.relay.transform.PartitionGraph()(mod) + mod = tvm.relay.transform.InferType()(mod) + + x_data = np.random.rand(10, 10).astype("float32") + w_data = [] + for _ in range(8): + w_data.append(np.random.rand(10, 10).astype("float32")) + + map_inputs = {"w{}".format(i): w_data[i] for i in range(8)} + map_inputs["x"] = x_data + output_list = generate_ref_data(mod, map_inputs) + input_list = [map_inputs["x"]] + input_list.extend([map_inputs["w{}".format(i)] for i in range(8)]) + compile_and_run(mod, input_list, output_list) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index db6c55bca12a..712bd8d348a2 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -26,7 +26,7 @@ import tvm import tvm.relay -from tvm.relay.backend import graph_executor_factory +from tvm.relay.backend import executor_factory import tvm.runtime.module import tvm.testing from tvm.contrib import utils @@ -170,7 +170,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ @tvm.testing.requires_micro def test_export_model(): module = tvm.support.FrontendTestModule() - 
factory = graph_executor_factory.GraphExecutorFactoryModule( + factory = executor_factory.GraphExecutorFactoryModule( None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {} ) From 0b4b12ed51170dc0ec22d5fe236a472c509ebb68 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 20 Apr 2021 23:46:48 +0100 Subject: [PATCH 11/33] fix tests - 2 Change-Id: I2117f9d4392bfd87102ecbef0993c8b320f479a0 --- python/tvm/driver/tvmc/compiler.py | 7 +++++++ python/tvm/micro/model_library_format.py | 1 + python/tvm/relay/backend/executor_factory.py | 2 +- tests/micro/zephyr/test_zephyr.py | 2 +- tests/python/relay/test_external_codegen.py | 2 +- tests/python/unittest/test_runtime_graph.py | 2 +- .../python/unittest/test_runtime_module_based_interface.py | 2 +- tests/python/unittest/test_runtime_profiling.py | 2 +- 8 files changed, 14 insertions(+), 6 deletions(-) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 3f1d04aee7fd..dffa06671191 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -251,8 +251,15 @@ def compile_model( source = str(mod) if source_type == "relay" else lib.get_source(source_type) dumps[source_type] = source +<<<<<<< HEAD # Create a new tvmc model package object from the graph definition. package_path = tvmc_model.export_package(graph_module, package_path, cross, export_format) +======= + # TODO we need to update this return to use the updated graph module APIs + # as these getter functions will be deprecated in the next release (@leandron) + return graph_module.get_graph_json(), graph_module.get_lib(), graph_module.get_params(), dumps + +>>>>>>> 99ce0408b... fix tests - 2 # Write dumps to file. if dumps: diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 833740ab7fc2..3c136adea224 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -89,6 +89,7 @@ def _build_memory_map(graph_json): graph = json.loads(graph_json) seen_storage_ids = set() + memory_map = [] for node_id, storage_id in enumerate(graph["attrs"]["storage_id"][1]): if storage_id in seen_storage_ids: continue diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index d81305145ae1..9fa611f86c11 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -148,7 +148,7 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return self.module.export_library(file_name, fcompile, addons, **kwargs) def save_executor_config(self): - return self.internal_repr + return self.graph_json def get_params(self): return self.params diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index d75e1b607b8d..4da1f12b273a 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -235,7 +235,7 @@ def test_onnx(platform, west_cmd): target = tvm.target.target.micro(model, options=["-link-params=1"]) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lowered = relay.build(relay_mod, target, params=params) - graph = lowered.get_json() + graph = lowered.get_graph_json() with _make_session(model, target, zephyr_board, west_cmd, lowered.lib) as session: graph_mod = tvm.micro.create_local_graph_executor( diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 
9f6d88e47f0b..be92ef200c31 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -353,7 +353,7 @@ def test_load_params_with_constants_in_ext_codegen(): graph_module = relay.build(mod, target="llvm", params=params) lib = update_lib(graph_module.get_lib()) - rt_mod = tvm.contrib.graph_executor.create(graph_module.get_json(), lib, tvm.cpu(0)) + rt_mod = tvm.contrib.graph_executor.create(graph_module.get_graph_json(), lib, tvm.cpu(0)) rt_mod.load_params(runtime.save_param_dict(graph_module.get_params())) diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index aac7e497f38f..44f20878b800 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -131,7 +131,7 @@ def test_load_unexpected_params(): graph_module = relay.build(mod, target="llvm", params=params) rt_mod = tvm.contrib.graph_executor.create( - graph_module.get_json(), graph_module.get_lib(), tvm.cpu(0) + graph_module.get_graph_json(), graph_module.get_lib(), tvm.cpu(0) ) new_params = graph_module.get_params() diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 1d80c60de790..f85edfc8d033 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -543,7 +543,7 @@ def test_debug_graph_executor(): debug_g_mod = debug_executor.GraphModuleDebug( complied_graph_lib["debug_create"]("default", dev), [dev], - complied_graph_lib.get_json(), + complied_graph_lib.get_graph_json(), None, ) debug_g_mod.set_input("data", data) diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 563775fe0c62..2edac56683a8 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -51,7 +51,7 @@ def test_graph_executor(target, dev): mod, params = mlp.get_workload(1) exe = relay.build(mod, target, params=params) - gr = debug_executor.create(exe.get_json(), exe.lib, dev) + gr = debug_executor.create(exe.get_graph_json(), exe.lib, dev) data = np.random.rand(1, 1, 28, 28).astype("float32") report = gr.profile(data=data) From 0b108f4fa252b93c3b41a47b163686c16a96fa06 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 21 Apr 2021 10:51:37 +0100 Subject: [PATCH 12/33] fix tests - 3 Change-Id: Ic0373543b0f9a54dbd4dc32d428272f7293200ba --- tests/python/relay/test_backend_graph_executor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 54cd31f6fb16..33bd1d1e85b3 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -133,15 +133,14 @@ def test_plan_memory(): smap = relay.backend._backend.GraphPlanMemory(func) storage_ids = set() device_types = set() - storage_sizes = set() + storage_sizes = {} for k, v in smap.items(): assert len(v) == 3 for x in v[0]: storage_ids.add(x.value) + storage_sizes[x.value] = v[2] for x in v[1]: device_types.add(x.value) - for x in v[2]: - storage_sizes.add(x.value) # Current rule requires vars have unique storage id # because we don't do inplace, we will need another From 0473a033eeb58223e4c408ccbbeaa43238a453b1 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 21 Apr 2021 15:56:01 +0100 
Subject: [PATCH 13/33] fix tests - 4 Change-Id: I8a6f229c9a3a9e169779c8d49cbfa3f473348b1f --- src/relay/backend/build_module.cc | 7 +++++-- tests/python/relay/aot/test_crt_aot.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 25c676362e19..cfcf4e12d56c 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -201,9 +201,12 @@ class RelayBuildModule : public runtime::ModuleNode { }); } else if (name == "get_executor_type") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + const String executor_str = kTvmExecutorGraph; + auto target_host = GetTargetHost(); - const String executor_str = - target_host->GetAttr("executor").value_or(kTvmExecutorGraph); + if (target_host.defined()) { + target_host->GetAttr("executor").value_or(kTvmExecutorGraph); + } *rv = executor_str; }); diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index f4f46ab67479..0f1f2ad369e7 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import tflite import os import io import struct From 13a939c17cad76622f45df195b4c91e31d4a73da Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 21 Apr 2021 18:30:31 +0100 Subject: [PATCH 14/33] Addressing comments - 5 Change-Id: Ib9ccd07c87392034a21b2eb70955d0b091b780f1 --- apps/bundle_deploy/bundle.c | 4 +-- apps/bundle_deploy/bundle_static.c | 4 +-- include/tvm/target/target_kind.h | 6 ----- python/tvm/relay/build_module.py | 33 ++++++++++++++--------- src/relay/backend/aot_executor_codegen.cc | 2 +- src/relay/backend/build_module.cc | 25 ++++++----------- src/runtime/meta_data.h | 11 +++++++- src/target/source/source_module.cc | 2 +- src/target/target_kind.cc | 1 - tests/cpp/relay_build_module_test.cc | 2 +- tests/cpp/utvm_runtime_standalone_test.cc | 2 +- tests/python/relay/aot/aot_test_utils.py | 4 +-- 12 files changed, 48 insertions(+), 48 deletions(-) diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 4dbe1141c6d4..6018d40dd300 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -64,8 +64,8 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, dev.device_id = device_id; // declare pointers - TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), - CRT_MEMORY_PAGE_SIZE_LOG2)); + TVM_CCALL(PageMemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), + CRT_MEMORY_PAGE_SIZE_LOG2)); TVM_CCALL(TVMInitializeRuntime()); TVMPackedFunc pf; TVMArgs args = TVMArgs_Create(NULL, NULL, 0); diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index d0eeec4d956f..18a7b2bbb0ff 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -64,8 +64,8 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, dev.device_id = device_id; // get pointers - TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), - CRT_MEMORY_PAGE_SIZE_LOG2)); + TVM_CCALL(PageMemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), + CRT_MEMORY_PAGE_SIZE_LOG2)); TVM_CCALL(TVMInitializeRuntime()); TVMPackedFunc pf; TVMArgs args = TVMArgs_Create(NULL, NULL, 0); diff --git a/include/tvm/target/target_kind.h 
b/include/tvm/target/target_kind.h index 2b9d2c5f5a69..e7da2dd413a0 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -140,12 +140,6 @@ static constexpr const char* kTvmRuntimeCpp = "c++"; /*! \brief Value used with --runtime in target specs to indicate the C runtime. */ static constexpr const char* kTvmRuntimeCrt = "c"; -/*! \brief Value used with --executor in target specs to indicate the graph executor. */ -static constexpr const char* kTvmExecutorGraph = "graph"; - -/*! \brief Value used with --executor in target specs to indicate the aot executor. */ -static constexpr const char* kTvmExecutorAot = "aot"; - /*! * \brief Helper structure to register TargetKind * \sa TVM_REGISTER_TARGET_KIND diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 81e81852d438..011d059ee358 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -83,9 +83,8 @@ def __init__(self): self._optimize = self.mod["optimize"] self._set_params_func = self.mod["set_params"] self._get_params_func = self.mod["get_params"] - self._get_executor_type = self.mod["get_executor_type"] - def build(self, mod, target=None, target_host=None, params=None): + def build(self, mod, target=None, target_host=None, params=None, executor="graph"): """ Parameters ---------- @@ -110,6 +109,11 @@ def build(self, mod, target=None, target_host=None, params=None): Input parameters to the graph that do not change during inference time. Used for constant folding. + executor: str[Optional] + The type of executor to be used in order to run the model: + - If "graph" is specified, then the graph_executor will be used + - If "aot" is specified, then the aot_executor will be used + Returns ------- <<<<<<< HEAD @@ -145,13 +149,13 @@ def build(self, mod, target=None, target_host=None, params=None): old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler - self._build(mod, target, target_host) + self._build(mod, target, target_host, executor) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts mod = self.get_module() params = self.get_params() - internal_repr = self.get_graph_json() if self.get_executor_type() == "graph" else None + internal_repr = self.get_graph_json() if executor == "graph" else None return internal_repr, mod, params @@ -209,10 +213,6 @@ def get_params(self): ret[key] = value.data return ret - def get_executor_type(self): - """ Return the executor TVM is building for """ - return self._get_executor_type() - @register_func("tvm.relay.module_export_library") def _module_export(module, file_name): # fcompile, addons, kwargs? @@ -229,7 +229,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo return build(mod, target, params=params, mod_name=mod_name).module -def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): +def build(ir_mod, target=None, target_host=None, params=None, mod_name="default", executor="graph"): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph executor. 
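The `executor` argument introduced in this hunk replaces the earlier `--executor` target attribute: callers now select the executor explicitly when invoking `relay.build`, and only the graph executor produces a JSON graph. As a rough usage sketch (not part of the patch; it mirrors the `aot_test_utils.py` helper later in this series, and the workload and output file name are placeholders):

    import tvm
    from tvm import relay
    from tvm.relay import testing
    from tvm.micro import export_model_library_format

    # Placeholder workload; any Relay module with bound params works the same way.
    mod, params = testing.mobilenet.get_workload(batch_size=1)

    # The C runtime with linked params is what the AOT tests in this series use.
    target = "c -runtime=c --link-params"
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, target, target_host=target, params=params, executor="aot")

    # The AOT factory module carries no graph JSON; it can still be exported as a
    # Model Library Format archive, whose metadata records the "aot" runtime.
    export_model_library_format(lib, "./module.tar")

With `executor="graph"` the same call returns a GraphExecutorFactoryModule whose graph JSON is available as before.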
@@ -259,6 +259,11 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" mod_name: Optional[str] The module name we will build + executor: Optional[str] + The type of executor to be used in order to run the model: + - If "graph" is specified, then the graph_executor will be used + - If "aot" is specified, then the aot_executor will be used + Returns ------- <<<<<<< HEAD @@ -311,18 +316,20 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() - internal_repr, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) + internal_repr, runtime_mod, params = bld_mod.build( + mod=ir_mod, target=target, params=params, executor=executor + ) - if bld_mod.get_executor_type() == "aot": + if executor == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( ir_mod, target, runtime_mod, mod_name, params ) - elif bld_mod.get_executor_type() == "graph": + elif executor == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, target, internal_repr, runtime_mod, mod_name, params ) else: - assert False, "Executor " + bld_mod.get_executor_type() + " not supported" + assert False, "Executor " + executor + " not supported" return executor_factory diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index f9109a7924f2..d544d1c4e4bb 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -563,7 +563,7 @@ class AOTExecutorCodegen : public ExprVisitor { ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); } - ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size()); + ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size(), kTvmExecutorAot); return ret; } }; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index cfcf4e12d56c..0d6fddd463cf 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -170,8 +170,8 @@ class RelayBuildModule : public runtime::ModuleNode { [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); }); } else if (name == "build") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - ICHECK_EQ(args.num_args, 3); - this->Build(args[0], args[1], args[2]); + ICHECK_EQ(args.num_args, 4); + this->Build(args[0], args[1], args[2], args[3]); }); } else if (name == "list_params") { return PackedFunc( @@ -199,17 +199,6 @@ class RelayBuildModule : public runtime::ModuleNode { ICHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1], this->params_); }); - } else if (name == "get_executor_type") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - const String executor_str = kTvmExecutorGraph; - - auto target_host = GetTargetHost(); - if (target_host.defined()) { - target_host->GetAttr("executor").value_or(kTvmExecutorGraph); - } - - *rv = executor_str; - }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); @@ -278,10 +267,12 @@ class RelayBuildModule : public runtime::ModuleNode { * \param target Target device * \param target_host Host target device */ - void Build(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host) { + void Build(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host, + const String executor) { // Create protected variable 
targets_ from ground up targets_ = targets; target_host_ = target_host; + executor_ = executor; CheckAndUpdateHostConsistency(&targets_, &target_host_); BuildRelay(mod, params_); // Clear compile engine so that tuning schedules can be changed between runs. See issue #6096. @@ -522,9 +513,7 @@ class RelayBuildModule : public runtime::ModuleNode { auto func = Downcast(relay_module->Lookup("main")); // Generate code for the updated function. - const String executor_str = - target_host->GetAttr("executor").value_or(kTvmExecutorGraph); - executor_codegen_ = MakeExecutorCodegen(executor_str); + executor_codegen_ = MakeExecutorCodegen(executor_); executor_codegen_->Init(nullptr, targets_); executor_codegen_->Codegen(func); executor_codegen_->UpdateOutput(&ret_); @@ -598,6 +587,8 @@ class RelayBuildModule : public runtime::ModuleNode { std::unordered_map params_; /*! \brief building output */ BuildOutput ret_; + /*! \brief Executor used to execute the graph */ + String executor_; }; runtime::Module RelayBuildCreate() { diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 6cb39187193b..d9cf99628a2d 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -37,6 +37,12 @@ #include "runtime_base.h" +/*! \brief Value used to indicate the graph executor. */ +static constexpr const char* kTvmExecutorGraph = "graph"; + +/*! \brief Value used to indicate the aot executor. */ +static constexpr const char* kTvmExecutorAot = "aot"; + namespace tvm { namespace runtime { @@ -49,6 +55,8 @@ class MetadataNode : public Object { int num_inputs = 1; /*! \brief number of outputs of the main function */ int num_outputs = 1; + /*! \brief the executor to be used to run the model */ + String executor; static constexpr const uint32_t _type_index = TypeIndex::kDynamic; static constexpr const char* _type_key = "MetadataObj"; @@ -60,10 +68,11 @@ class MetadataNode : public Object { */ class Metadata : public ObjectRef { public: - TVM_DLL Metadata(int num_inputs, int num_outputs) { + TVM_DLL Metadata(int num_inputs, int num_outputs, String executor = kTvmExecutorGraph) { auto n = make_object(); n->num_inputs = num_inputs; n->num_outputs = num_outputs; + n->executor = executor; data_ = std::move(n); } diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index ccf520840f47..c3702ad6d8da 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -213,7 +213,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { CreateFuncRegistry(); GenerateCrtSystemLib(); } - if (target_->GetAttr("executor").value_or(kTvmExecutorGraph) == kTvmExecutorAot) { + if (metadata_->executor == kTvmExecutorAot) { GenerateAOTDescriptor(); } code_ << ";"; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 474b1b0d8ac4..2ca4f4533c7e 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -227,7 +227,6 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("runtime") .add_attr_option("mcpu") .add_attr_option("march") - .add_attr_option("executor") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLGPU) diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 344fd3d40ba8..f01e41465395 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -119,7 +119,7 @@ TEST(Relay, BuildModule) { targets.Set(0, llvm_tgt); auto relay_mod = tvm::IRModule::FromExpr(func); ICHECK(relay_mod.defined()) << "Module must be defined"; - 
build_f(relay_mod, targets, llvm_tgt); + build_f(relay_mod, targets, llvm_tgt, "graph"); std::string json = json_f(); tvm::runtime::Module mod = mod_f(); // run diff --git a/tests/cpp/utvm_runtime_standalone_test.cc b/tests/cpp/utvm_runtime_standalone_test.cc index 5c642a37d6bc..f0d301e89add 100644 --- a/tests/cpp/utvm_runtime_standalone_test.cc +++ b/tests/cpp/utvm_runtime_standalone_test.cc @@ -91,7 +91,7 @@ TEST(MicroStandaloneRuntime, BuildModule) { Target llvm_tgt = Target("llvm"); targets.Set(0, llvm_tgt); - build_f(func, targets, llvm_tgt); + build_f(func, targets, llvm_tgt, "graph"); std::string json = json_f(); tvm::runtime::Module mod = mod_f(); std::string o_fname = std::tmpnam(nullptr); diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index fe97844e5d15..f8ac822aebeb 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -154,10 +154,10 @@ def compile_and_run(mod, input_list, output_list, params=None): """ This method verifies the generated source """ - target = "c -runtime=c --link-params --executor=aot" + target = "c -runtime=c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - lib = tvm.relay.build(mod, target, target_host=target, params=params) + lib = tvm.relay.build(mod, target, target_host=target, params=params, executor="aot") tmp_path = utils.tempdir() tmp_dir = tmp_path.temp_dir From 6cb16f6484d96ca4e2eaf6b78e66c4f45f6fc206 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 21 Apr 2021 20:05:09 +0100 Subject: [PATCH 15/33] fix tests - 5 Change-Id: I4b13c3b548ced414991e83072e9e6fc99b64f939 --- src/runtime/meta_data.h | 4 ++-- src/target/source/source_module.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index d9cf99628a2d..a573d93daf5d 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -56,7 +56,7 @@ class MetadataNode : public Object { /*! \brief number of outputs of the main function */ int num_outputs = 1; /*! 
\brief the executor to be used to run the model */ - String executor; + String executor = kTvmExecutorGraph; static constexpr const uint32_t _type_index = TypeIndex::kDynamic; static constexpr const char* _type_key = "MetadataObj"; @@ -68,7 +68,7 @@ class MetadataNode : public Object { */ class Metadata : public ObjectRef { public: - TVM_DLL Metadata(int num_inputs, int num_outputs, String executor = kTvmExecutorGraph) { + TVM_DLL Metadata(int num_inputs, int num_outputs, String executor) { auto n = make_object(); n->num_inputs = num_inputs; n->num_outputs = num_outputs; diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index c3702ad6d8da..698d65f6c9fd 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -213,7 +213,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { CreateFuncRegistry(); GenerateCrtSystemLib(); } - if (metadata_->executor == kTvmExecutorAot) { + if (metadata_.defined() && metadata_->executor == kTvmExecutorAot) { GenerateAOTDescriptor(); } code_ << ";"; From 2d6e4df3655b154ff7aff0034f784bed93375171 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 22 Apr 2021 09:49:51 +0100 Subject: [PATCH 16/33] fix tests - 6 Change-Id: Id5af1f778ae25bc60849cc054a605181c1b7a765 --- tests/python/relay/aot/aot_test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index f8ac822aebeb..cf242b6276e0 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import tflite import os import io import struct From 4b5d18bf2fb68245792a438a48acc49f770f32d0 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 23 Apr 2021 10:32:33 +0100 Subject: [PATCH 17/33] addressing comments - 6 Change-Id: Id94a2bbcaae891f9498d41be538f13a952f55b81 --- include/tvm/runtime/crt/stack_allocator.h | 3 + python/tvm/micro/model_library_format.py | 4 +- python/tvm/relay/backend/executor_factory.py | 11 +- python/tvm/relay/build_module.py | 14 ++- src/relay/backend/aot_executor_codegen.cc | 3 +- src/relay/backend/build_module.cc | 10 +- src/relay/backend/utils.h | 8 +- src/runtime/crt/memory/stack_allocator.c | 15 ++- src/runtime/cuda/cuda_module.h | 3 +- src/runtime/file_utils.h | 3 +- src/runtime/hexagon/hexagon_module.h | 3 +- src/runtime/meta_data.h | 113 ------------------ src/runtime/metadata_module.cc | 3 +- src/runtime/metal/metal_module.h | 3 +- src/runtime/opencl/aocl/aocl_module.h | 3 +- src/runtime/opencl/opencl_module.h | 3 +- src/runtime/opencl/sdaccel/sdaccel_module.h | 3 +- src/runtime/rocm/rocm_module.h | 3 +- src/target/build_common.h | 3 +- src/target/metadata_module.cc | 4 +- src/target/metadata_module.h | 5 +- src/target/source/codegen_c_host.cc | 3 +- src/target/source/codegen_source_base.h | 3 +- src/target/source/source_module.cc | 2 +- src/target/source/source_module.h | 3 +- src/tir/transforms/lower_tvm_builtin.cc | 10 +- tests/cpp/relay_build_module_test.cc | 3 +- tests/cpp/utvm_runtime_standalone_test.cc | 3 +- .../relay/test_backend_graph_executor.py | 6 + 29 files changed, 77 insertions(+), 176 deletions(-) delete mode 100644 src/runtime/meta_data.h diff --git a/include/tvm/runtime/crt/stack_allocator.h b/include/tvm/runtime/crt/stack_allocator.h index 43db589831d9..1858abb3a4fc 100644 --- a/include/tvm/runtime/crt/stack_allocator.h +++ 
b/include/tvm/runtime/crt/stack_allocator.h @@ -25,6 +25,9 @@ #include "error_codes.h" +#define STACK_ALLOCATOR_TAG 0xabcd1234 +#define STACK_ALLOCATOR_TAG_SIZE_BYTES 4 + /*! Memory alignment for allocator */ #ifndef TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 3c136adea224..07227abc42b9 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -133,7 +133,7 @@ def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, fil """ tempdir = utils.tempdir() is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule) - memory_map = [] if is_aot else _build_memory_map(mod.get_internal_repr()) + memory_map = [] if is_aot else _build_memory_map(mod.get_excecutor_config()) runtime = ["aot"] if is_aot else ["graph"] metadata = { @@ -165,7 +165,7 @@ def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, fil graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) os.makedirs(graph_config_dir_path) with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: - f.write(mod.save_executor_config()) + f.write(mod.get_executor_config()) with tarfile.open(file_name, "w") as tar_f: diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index 9fa611f86c11..ed87403bb278 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -30,7 +30,7 @@ class ExecutorFactoryModule: """ @abstractmethod - def get_internal_repr(self): + def get_excecutor_config(self): """Common function to return the internal representation the executor relies upon to execute the network """ @@ -69,7 +69,7 @@ def __next__(self): if self.iter_cnt > 2: raise StopIteration - objs = [self.get_internal_repr(), self.lib, self.params] + objs = [self.get_excecutor_config(), self.lib, self.params] obj = objs[self.iter_cnt] self.iter_cnt += 1 return obj @@ -101,7 +101,7 @@ def __init__(self, ir_mod, target, libmod, libmod_name, params): def get_params(self): return self.params - def get_internal_repr(self): + def get_excecutor_config(self): return None def get_lib(self): @@ -147,16 +147,13 @@ def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params): def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return self.module.export_library(file_name, fcompile, addons, **kwargs) - def save_executor_config(self): - return self.graph_json - def get_params(self): return self.params def get_graph_json(self): return self.graph_json - def get_internal_repr(self): + def get_excecutor_config(self): return self.graph_json def get_lib(self): diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 011d059ee358..35d28cddda0e 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -155,9 +155,9 @@ def build(self, mod, target=None, target_host=None, params=None, executor="graph # Get artifacts mod = self.get_module() params = self.get_params() - internal_repr = self.get_graph_json() if executor == "graph" else None + executor_config = self.get_graph_json() if executor == "graph" else None - return internal_repr, mod, params + return executor_config, mod, params def optimize(self, mod, target=None, params=None): """ @@ -266,6 +266,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- +<<<<<<< HEAD 
<<<<<<< HEAD factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. @@ -275,6 +276,11 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" network. Can be a string representing the json graph (if we are building for graph executor) or the PrimFunc representing the AOT runner function +======= + executor_config : str + The internal configuration the executor uses to execute the + network. +>>>>>>> db667146e... addressing comments - 6 mod : tvm.Module The module containing necessary libraries. @@ -316,7 +322,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() - internal_repr, runtime_mod, params = bld_mod.build( + executor_config, runtime_mod, params = bld_mod.build( mod=ir_mod, target=target, params=params, executor=executor ) @@ -326,7 +332,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" ) elif executor == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( - ir_mod, target, internal_repr, runtime_mod, mod_name, params + ir_mod, target, executor_config, runtime_mod, mod_name, params ) else: assert False, "Executor " + executor + " not supported" diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index d544d1c4e4bb..baef6df13af8 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -563,7 +563,8 @@ class AOTExecutorCodegen : public ExprVisitor { ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); } - ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size(), kTvmExecutorAot); + ret.metadata = + runtime::Metadata(input_vars_.size(), return_sid_.size(), runtime::kTvmExecutorAot); return ret; } }; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 0d6fddd463cf..10b2b9b56ea9 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -139,9 +139,9 @@ struct GraphCodegen : ExecutorCodegen { */ std::unique_ptr MakeExecutorCodegen(String executor_str) { std::unique_ptr ret; - if (executor_str == kTvmExecutorGraph) { + if (executor_str == runtime::kTvmExecutorGraph) { ret = std::make_unique(); - } else if (executor_str == kTvmExecutorAot) { + } else if (executor_str == runtime::kTvmExecutorAot) { ret = std::make_unique(); } else { CHECK(false) << "Executor " << executor_str << " not supported"; @@ -587,7 +587,11 @@ class RelayBuildModule : public runtime::ModuleNode { std::unordered_map params_; /*! \brief building output */ BuildOutput ret_; - /*! \brief Executor used to execute the graph */ + /*! + * \brief Executor used to execute the model: + * - graph: use the json graph executor + * - aot: use the aot executor + */ String executor_; }; diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 7322f3dde3f5..e8288305488d 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -40,13 +41,14 @@ #include #include -#include "../../runtime/meta_data.h" - namespace tvm { namespace relay { namespace backend { -/*! \brief Lowered outputs */ +/*! + * \brief Executor generator artifacts. Those artifacts are subsequently + * used by the relay build process. 
+ */ struct LoweredOutput { std::string graph_json; Map lowered_funcs; diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index 5464b92b86c3..9fa7bc8074a1 100644 --- a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -16,17 +16,21 @@ * specific language governing permissions and limitations * under the License. */ - // LINT_C_FILE - #include +#ifdef TVM_CRT_DEBUG +#include +#endif void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes) { uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 1); uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; - +#ifdef TVM_CRT_DEBUG + *((uint32_t*) next_alloc) = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES) ^ STACK_ALLOCATOR_TAG; + next_alloc += 4; +#endif if (next_alloc > workspace_end) { return NULL; } @@ -36,6 +40,11 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ } tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { +#ifdef TVM_CRT_DEBUG + uint32_t tag = *(((uint32_t*) tvm_runtime_workspace->next_alloc ) - 1); + uint32_t nbytes = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); + CHECK_EQ(tag, nbytes^STACK_ALLOCATOR_TAG, "tag did not match"); +#endif tvm_runtime_workspace->next_alloc = ptr; return 0; } diff --git a/src/runtime/cuda/cuda_module.h b/src/runtime/cuda/cuda_module.h index e65c5fe60811..60a89b0d7434 100644 --- a/src/runtime/cuda/cuda_module.h +++ b/src/runtime/cuda/cuda_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_CUDA_CUDA_MODULE_H_ #define TVM_RUNTIME_CUDA_CUDA_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../meta_data.h" - namespace tvm { namespace runtime { diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h index 718d10d5df70..d035ba818538 100644 --- a/src/runtime/file_utils.h +++ b/src/runtime/file_utils.h @@ -25,12 +25,11 @@ #define TVM_RUNTIME_FILE_UTILS_H_ #include +#include #include #include -#include "meta_data.h" - namespace tvm { namespace runtime { /*! diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index 1288b933410c..00934ee22f42 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -21,6 +21,7 @@ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #include +#include #include #include @@ -29,8 +30,6 @@ #include #include -#include "../meta_data.h" - namespace tvm { namespace runtime { diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h deleted file mode 100644 index a573d93daf5d..000000000000 --- a/src/runtime/meta_data.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file meta_data.h - * \brief Meta data related utilities - */ -#ifndef TVM_RUNTIME_META_DATA_H_ -#define TVM_RUNTIME_META_DATA_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "runtime_base.h" - -/*! \brief Value used to indicate the graph executor. */ -static constexpr const char* kTvmExecutorGraph = "graph"; - -/*! \brief Value used to indicate the aot executor. */ -static constexpr const char* kTvmExecutorAot = "aot"; - -namespace tvm { -namespace runtime { - -/*! - * \brief Structure that can be optionally used by the executor codegen - */ -class MetadataNode : public Object { - public: - /*! \brief number of inputs of the main function */ - int num_inputs = 1; - /*! \brief number of outputs of the main function */ - int num_outputs = 1; - /*! \brief the executor to be used to run the model */ - String executor = kTvmExecutorGraph; - - static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const char* _type_key = "MetadataObj"; - TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); -}; - -/*! - * \brief Managed reference to MetadataNode. - */ -class Metadata : public ObjectRef { - public: - TVM_DLL Metadata(int num_inputs, int num_outputs, String executor) { - auto n = make_object(); - n->num_inputs = num_inputs; - n->num_outputs = num_outputs; - n->executor = executor; - data_ = std::move(n); - } - - TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); -}; - -/*! - * \brief Create a metadata module object. - * - * \param metadata The variable name to ndarray mapping. - * \param sym_vars The symbol to the list of required constant variables - * mapping. - * - * \return The created metadata module. - */ -Module MetadataModuleCreate( - const std::unordered_map& metadata, - const std::unordered_map>& sym_vars); - -/*! \brief function information needed by device */ -struct FunctionInfo { - std::string name; - std::vector arg_types; - std::vector thread_axis_tags; - - void Save(dmlc::JSONWriter* writer) const; - void Load(dmlc::JSONReader* reader); - void Save(dmlc::Stream* writer) const; - bool Load(dmlc::Stream* reader); -}; -} // namespace runtime -} // namespace tvm - -namespace dmlc { -DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::FunctionInfo, true); -} // namespace dmlc -#endif // TVM_RUNTIME_META_DATA_H_ diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc index 4a1d89ce1a1f..4e5f16de7777 100644 --- a/src/runtime/metadata_module.cc +++ b/src/runtime/metadata_module.cc @@ -28,6 +28,7 @@ * codegen and runtimes. 
*/ #include +#include #include #include #include @@ -35,8 +36,6 @@ #include #include -#include "meta_data.h" - namespace tvm { namespace runtime { diff --git a/src/runtime/metal/metal_module.h b/src/runtime/metal/metal_module.h index 77cdf64df8bc..8dd858d5e6c7 100644 --- a/src/runtime/metal/metal_module.h +++ b/src/runtime/metal/metal_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_METAL_METAL_MODULE_H_ #define TVM_RUNTIME_METAL_METAL_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../meta_data.h" - namespace tvm { namespace runtime { /*! \brief Maximum number of GPU supported in MetalModule. */ diff --git a/src/runtime/opencl/aocl/aocl_module.h b/src/runtime/opencl/aocl/aocl_module.h index 199a94decdd8..605e129d1a14 100644 --- a/src/runtime/opencl/aocl/aocl_module.h +++ b/src/runtime/opencl/aocl/aocl_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_ #define TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../../meta_data.h" - namespace tvm { namespace runtime { /*! diff --git a/src/runtime/opencl/opencl_module.h b/src/runtime/opencl/opencl_module.h index 77f4b8010779..f58c8a9b612a 100644 --- a/src/runtime/opencl/opencl_module.h +++ b/src/runtime/opencl/opencl_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_ #define TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../meta_data.h" - namespace tvm { namespace runtime { /*! diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.h b/src/runtime/opencl/sdaccel/sdaccel_module.h index 322decc4460c..756b54825c7a 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_module.h +++ b/src/runtime/opencl/sdaccel/sdaccel_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_ #define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../../meta_data.h" - namespace tvm { namespace runtime { /*! diff --git a/src/runtime/rocm/rocm_module.h b/src/runtime/rocm/rocm_module.h index c17e123c1a12..aef6560f243c 100644 --- a/src/runtime/rocm/rocm_module.h +++ b/src/runtime/rocm/rocm_module.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_ROCM_ROCM_MODULE_H_ #define TVM_RUNTIME_ROCM_ROCM_MODULE_H_ +#include #include #include @@ -31,8 +32,6 @@ #include #include -#include "../meta_data.h" - namespace tvm { namespace runtime { diff --git a/src/target/build_common.h b/src/target/build_common.h index 1816c3ac2650..7c65b1771d70 100644 --- a/src/target/build_common.h +++ b/src/target/build_common.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -35,8 +36,6 @@ #include #include -#include "../runtime/meta_data.h" - namespace tvm { namespace codegen { diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 4fdcd1b09325..618ab8bebe19 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -21,12 +21,12 @@ * \file metadata_module.cc * \brief Defines functions that build MetadataModules for C++ and C runtimes. 
*/ - #include "metadata_module.h" +#include + #include -#include "../runtime/meta_data.h" #include "llvm/llvm_module.h" #include "source/source_module.h" diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h index 49404a63fdeb..f660ea0d2c70 100644 --- a/src/target/metadata_module.h +++ b/src/target/metadata_module.h @@ -26,6 +26,7 @@ #define TVM_TARGET_METADATA_MODULE_H_ #include +#include #include #include #include @@ -33,15 +34,13 @@ #include #include -#include "../runtime/meta_data.h" - namespace tvm { namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, tvm::runtime::Module target_module, const Array& ext_modules, Target target, - int num_inputs = 1, int num_outputs = 1); + runtime::Metadata metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 8402ccd4737e..0bfbade23f01 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -47,7 +47,6 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s decl_stream << "#define TVM_EXPORTS\n"; decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - decl_stream << "#include \n"; decl_stream << "void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); @@ -309,7 +308,7 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { auto function_info = GetFunctionInfo(op); this->PrintGetFuncFromBackend(function_info.func_name, function_info.func_name_packed); - this->PrintFuncCall(function_info.func_name, function_info.num_args); + this->PrintFuncCall(function_info.func_name_packed, function_info.num_args); } else if (op->op.same_as(builtin::tvm_call_cpacked_lowered())) { auto function_info = GetFunctionInfo(op); this->PrintFuncCallC(function_info.func_name, function_info.num_args); diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 80fc9b486971..23be6189c82e 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -25,6 +25,7 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_ #define TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_ +#include #include #include #include @@ -34,8 +35,6 @@ #include #include -#include "../../runtime/meta_data.h" - namespace tvm { namespace codegen { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 698d65f6c9fd..661df9305036 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -213,7 +213,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { CreateFuncRegistry(); GenerateCrtSystemLib(); } - if (metadata_.defined() && metadata_->executor == kTvmExecutorAot) { + if (metadata_.defined() && metadata_->executor == runtime::kTvmExecutorAot) { GenerateAOTDescriptor(); } code_ << ";"; diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index 6226ba2f22b3..33b93df94a79 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -26,11 +26,10 @@ #define TVM_TARGET_SOURCE_SOURCE_MODULE_H_ #include +#include #include #include -#include "../../runtime/meta_data.h" - namespace tvm { namespace codegen { diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 
db5af9412d95..7cad8b63cbca 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -179,7 +179,7 @@ class BuiltinLower : public StmtExprMutator { } PrimExpr VisitExpr_(const CallNode* op) final { if (op->op.same_as(builtin::tvm_call_packed())) { - return MakeCallPacked(op, true); + return MakeCallPacked(op, /* use_string_lookup */ true); } else if (op->op.same_as(builtin::tvm_call_cpacked())) { return MakeCallPacked(op, false); } else if (op->op.same_as(builtin::tvm_call_trace_packed())) { @@ -300,11 +300,9 @@ class BuiltinLower : public StmtExprMutator { ConstInt32(arg_stack_begin), ConstInt32(arg_stack_begin + op->args.size() - 1)}; - if (use_string_lookup) { - return Call(op->dtype, builtin::tvm_call_packed_lowered(), packed_args); - } else { - return Call(op->dtype, builtin::tvm_call_cpacked_lowered(), packed_args); - } + auto builtin_call = use_string_lookup ? builtin::tvm_call_packed_lowered() + : builtin::tvm_call_cpacked_lowered(); + return Call(op->dtype, builtin_call, packed_args); } PrimExpr MakeCallTracePacked(const CallNode* op) { diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index f01e41465395..020513905be3 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -119,7 +120,7 @@ TEST(Relay, BuildModule) { targets.Set(0, llvm_tgt); auto relay_mod = tvm::IRModule::FromExpr(func); ICHECK(relay_mod.defined()) << "Module must be defined"; - build_f(relay_mod, targets, llvm_tgt, "graph"); + build_f(relay_mod, targets, llvm_tgt, runtime::kTvmExecutorGraph); std::string json = json_f(); tvm::runtime::Module mod = mod_f(); // run diff --git a/tests/cpp/utvm_runtime_standalone_test.cc b/tests/cpp/utvm_runtime_standalone_test.cc index f0d301e89add..1c8db191d982 100644 --- a/tests/cpp/utvm_runtime_standalone_test.cc +++ b/tests/cpp/utvm_runtime_standalone_test.cc @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -91,7 +92,7 @@ TEST(MicroStandaloneRuntime, BuildModule) { Target llvm_tgt = Target("llvm"); targets.Set(0, llvm_tgt); - build_f(func, targets, llvm_tgt, "graph"); + build_f(func, targets, llvm_tgt, runtime::kTvmExecutorGraph); std::string json = json_f(); tvm::runtime::Module mod = mod_f(); std::string o_fname = std::tmpnam(nullptr); diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 33bd1d1e85b3..70a67c4aec44 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -149,6 +149,12 @@ def test_plan_memory(): assert len(device_types) == 1 assert len(storage_sizes) == 4 + # Check the specific size of each sid + assert storage_sizes[0][0] == 40 + assert storage_sizes[1][0] == 4 + assert storage_sizes[2][0] == 4 + assert storage_sizes[3][0] == 40 + def test_reshape_nop(): # test that reshape can be turned into nop From 371b6a65f798289f7e9810977555f3c67cafe90b Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 23 Apr 2021 12:55:22 +0100 Subject: [PATCH 18/33] fix linting - 4 Change-Id: I371a0aa5b81824b5a3a1278fac22ace57832027a --- src/runtime/crt/memory/stack_allocator.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index 9fa7bc8074a1..abc41cff55b1 100644 --- 
a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -28,8 +28,9 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; #ifdef TVM_CRT_DEBUG - *((uint32_t*) next_alloc) = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES) ^ STACK_ALLOCATOR_TAG; - next_alloc += 4; + const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); + *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; + next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; #endif if (next_alloc > workspace_end) { return NULL; @@ -41,9 +42,9 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { #ifdef TVM_CRT_DEBUG - uint32_t tag = *(((uint32_t*) tvm_runtime_workspace->next_alloc ) - 1); - uint32_t nbytes = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); - CHECK_EQ(tag, nbytes^STACK_ALLOCATOR_TAG, "tag did not match"); + uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); + uint32_t total_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); + CHECK_EQ(tag, total_size ^ STACK_ALLOCATOR_TAG, "tag did not match"); #endif tvm_runtime_workspace->next_alloc = ptr; return 0; From 774672753dd9ccd4e267f71ceddbe225157169ad Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 23 Apr 2021 14:02:37 +0100 Subject: [PATCH 19/33] add missing file Change-Id: If359bef96dd0773ead4f75f0d9f5234276347e2d --- include/tvm/runtime/meta_data.h | 111 ++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 include/tvm/runtime/meta_data.h diff --git a/include/tvm/runtime/meta_data.h b/include/tvm/runtime/meta_data.h new file mode 100644 index 000000000000..b447b174c90c --- /dev/null +++ b/include/tvm/runtime/meta_data.h @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file meta_data.h + * \brief Meta data related utilities + */ +#ifndef TVM_RUNTIME_META_DATA_H_ +#define TVM_RUNTIME_META_DATA_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace tvm { +namespace runtime { + +/*! \brief Value used to indicate the graph executor. */ +static constexpr const char* kTvmExecutorGraph = "graph"; + +/*! \brief Value used to indicate the aot executor. */ +static constexpr const char* kTvmExecutorAot = "aot"; + +/*! + * \brief Structure that can be optionally used by the executor codegen + */ +class MetadataNode : public Object { + public: + /*! 
\brief number of inputs of the main function */ + int num_inputs = 1; + /*! \brief number of outputs of the main function */ + int num_outputs = 1; + /*! \brief the executor to be used to run the model */ + String executor = kTvmExecutorGraph; + + static constexpr const uint32_t _type_index = TypeIndex::kDynamic; + static constexpr const char* _type_key = "MetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); +}; + +/*! + * \brief Managed reference to MetadataNode. + */ +class Metadata : public ObjectRef { + public: + TVM_DLL Metadata(int num_inputs, int num_outputs, String executor) { + auto n = make_object(); + n->num_inputs = num_inputs; + n->num_outputs = num_outputs; + n->executor = executor; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); +}; + +/*! + * \brief Create a metadata module object. + * + * \param metadata The variable name to ndarray mapping. + * \param sym_vars The symbol to the list of required constant variables + * mapping. + * + * \return The created metadata module. + */ +Module MetadataModuleCreate( + const std::unordered_map& metadata, + const std::unordered_map>& sym_vars); + +/*! \brief function information needed by device */ +struct FunctionInfo { + std::string name; + std::vector arg_types; + std::vector thread_axis_tags; + + void Save(dmlc::JSONWriter* writer) const; + void Load(dmlc::JSONReader* reader); + void Save(dmlc::Stream* writer) const; + bool Load(dmlc::Stream* reader); +}; +} // namespace runtime +} // namespace tvm + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::FunctionInfo, true); +} // namespace dmlc +#endif // TVM_RUNTIME_META_DATA_H_ From 8f37fd38bca68c95c82658331ef40e7777704589 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 23 Apr 2021 15:32:12 +0100 Subject: [PATCH 20/33] fix build Change-Id: I73fc1feb6f7b5d454a528e3289228484dc2b07d5 --- src/runtime/cuda/cuda_module.cc | 2 +- src/runtime/hexagon/hexagon_module.cc | 2 +- src/runtime/metal/metal_module.mm | 2 +- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/rocm/rocm_module.cc | 2 +- src/runtime/vulkan/vulkan_module.h | 3 ++- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index a877bc634300..24ecb8779159 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -33,7 +34,6 @@ #include #include "../file_utils.h" -#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "cuda_common.h" diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 88815c388ccd..7f3830a8d400 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -23,6 +23,7 @@ #include #endif #include +#include #include #include @@ -32,7 +33,6 @@ #include #include "../file_utils.h" -#include "../meta_data.h" namespace tvm { namespace runtime { diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index e22caa21a81e..1727a702240a 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -22,13 +22,13 @@ */ #include "metal_module.h" #include +#include #include #include #include #include #include #include "../file_utils.h" -#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include 
"metal_common.h" diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 93420feec805..4b5359785b1f 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -64,7 +65,6 @@ #include #include "../file_utils.h" -#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "../workspace_pool.h" diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 567557c56794..488e18f59918 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -23,6 +23,7 @@ #include "rocm_module.h" #include +#include #include #include @@ -32,7 +33,6 @@ #include #include "../file_utils.h" -#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "rocm_common.h" diff --git a/src/runtime/vulkan/vulkan_module.h b/src/runtime/vulkan/vulkan_module.h index c75a077a361d..c622042fd5cf 100644 --- a/src/runtime/vulkan/vulkan_module.h +++ b/src/runtime/vulkan/vulkan_module.h @@ -20,10 +20,11 @@ #ifndef TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ #define TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ +#include + #include #include -#include "../meta_data.h" #include "vulkan_shader.h" namespace tvm { From e4a5fbb96771b564dc61fefa5f65040cb993145c Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 23 Apr 2021 18:25:34 +0100 Subject: [PATCH 21/33] addressing comments - 7 Change-Id: I7f908f3908ffc77e408391f62edcc06f2600c6c2 --- src/runtime/cuda/cuda_module.cc | 2 +- src/runtime/cuda/cuda_module.h | 3 +- src/runtime/file_utils.h | 3 +- src/runtime/hexagon/hexagon_module.cc | 2 +- src/runtime/hexagon/hexagon_module.h | 3 +- src/runtime/meta_data.h | 72 +++++++++++++++++++++ src/runtime/metal/metal_module.h | 3 +- src/runtime/metal/metal_module.mm | 7 +- src/runtime/opencl/aocl/aocl_module.h | 3 +- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/opencl/opencl_module.h | 3 +- src/runtime/opencl/sdaccel/sdaccel_module.h | 3 +- src/runtime/rocm/rocm_module.cc | 2 +- src/runtime/rocm/rocm_module.h | 3 +- src/runtime/vulkan/vulkan_module.h | 3 +- 15 files changed, 96 insertions(+), 18 deletions(-) create mode 100644 src/runtime/meta_data.h diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index 24ecb8779159..a877bc634300 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -34,6 +33,7 @@ #include #include "../file_utils.h" +#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "cuda_common.h" diff --git a/src/runtime/cuda/cuda_module.h b/src/runtime/cuda/cuda_module.h index 60a89b0d7434..e65c5fe60811 100644 --- a/src/runtime/cuda/cuda_module.h +++ b/src/runtime/cuda/cuda_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_CUDA_CUDA_MODULE_H_ #define TVM_RUNTIME_CUDA_CUDA_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../meta_data.h" + namespace tvm { namespace runtime { diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h index d035ba818538..718d10d5df70 100644 --- a/src/runtime/file_utils.h +++ b/src/runtime/file_utils.h @@ -25,11 +25,12 @@ #define TVM_RUNTIME_FILE_UTILS_H_ #include -#include #include #include +#include "meta_data.h" + namespace tvm { namespace runtime { /*! 
diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 7f3830a8d400..88815c388ccd 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -23,7 +23,6 @@ #include #endif #include -#include #include #include @@ -33,6 +32,7 @@ #include #include "../file_utils.h" +#include "../meta_data.h" namespace tvm { namespace runtime { diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index 00934ee22f42..1288b933410c 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -21,7 +21,6 @@ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #include -#include #include #include @@ -30,6 +29,8 @@ #include #include +#include "../meta_data.h" + namespace tvm { namespace runtime { diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h new file mode 100644 index 000000000000..03dba399fcb4 --- /dev/null +++ b/src/runtime/meta_data.h @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file meta_data.h + * \brief Meta data related utilities + */ +#ifndef TVM_RUNTIME_META_DATA_H_ +#define TVM_RUNTIME_META_DATA_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "runtime_base.h" + +namespace tvm { +namespace runtime { + +/*! + * \brief Create a metadata module object. + * + * \param metadata The variable name to ndarray mapping. + * \param sym_vars The symbol to the list of required constant variables + * mapping. + * + * \return The created metadata module. + */ +Module MetadataModuleCreate( + const std::unordered_map& metadata, + const std::unordered_map>& sym_vars); + +/*! \brief function information needed by device */ +struct FunctionInfo { + std::string name; + std::vector arg_types; + std::vector thread_axis_tags; + + void Save(dmlc::JSONWriter* writer) const; + void Load(dmlc::JSONReader* reader); + void Save(dmlc::Stream* writer) const; + bool Load(dmlc::Stream* reader); +}; +} // namespace runtime +} // namespace tvm + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::FunctionInfo, true); +} // namespace dmlc +#endif // TVM_RUNTIME_META_DATA_H_ diff --git a/src/runtime/metal/metal_module.h b/src/runtime/metal/metal_module.h index 8dd858d5e6c7..77cdf64df8bc 100644 --- a/src/runtime/metal/metal_module.h +++ b/src/runtime/metal/metal_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_METAL_METAL_MODULE_H_ #define TVM_RUNTIME_METAL_METAL_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../meta_data.h" + namespace tvm { namespace runtime { /*! \brief Maximum number of GPU supported in MetalModule. 
*/ diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 1727a702240a..a8b01815bf68 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -22,13 +22,13 @@ */ #include "metal_module.h" #include -#include #include #include #include #include #include #include "../file_utils.h" +#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "metal_common.h" @@ -185,8 +185,6 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons @autoreleasepool { metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); int device_id = t->device.device_id; - auto stream = static_cast(t->stream[device_id]); - if (stream->HasErrorHappened()) return; if (scache_[device_id] == nil) { scache_[device_id] = m_->GetPipelineState(device_id, func_name_); } @@ -194,7 +192,8 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2); auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup; CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup); - id cb = stream->GetCommandBuffer(); + id queue = w_->GetCommandQueue(t->device); + id cb = [queue commandBuffer]; id encoder = [cb computeCommandEncoder]; [encoder setComputePipelineState:scache_[device_id]]; for (size_t i = 0; i < num_buffer_args_; ++i) { diff --git a/src/runtime/opencl/aocl/aocl_module.h b/src/runtime/opencl/aocl/aocl_module.h index 605e129d1a14..199a94decdd8 100644 --- a/src/runtime/opencl/aocl/aocl_module.h +++ b/src/runtime/opencl/aocl/aocl_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_ #define TVM_RUNTIME_OPENCL_AOCL_AOCL_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../../meta_data.h" + namespace tvm { namespace runtime { /*! diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 4b5359785b1f..93420feec805 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -65,6 +64,7 @@ #include #include "../file_utils.h" +#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "../workspace_pool.h" diff --git a/src/runtime/opencl/opencl_module.h b/src/runtime/opencl/opencl_module.h index f58c8a9b612a..77f4b8010779 100644 --- a/src/runtime/opencl/opencl_module.h +++ b/src/runtime/opencl/opencl_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_ #define TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../meta_data.h" + namespace tvm { namespace runtime { /*! diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.h b/src/runtime/opencl/sdaccel/sdaccel_module.h index 756b54825c7a..322decc4460c 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_module.h +++ b/src/runtime/opencl/sdaccel/sdaccel_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_ #define TVM_RUNTIME_OPENCL_SDACCEL_SDACCEL_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../../meta_data.h" + namespace tvm { namespace runtime { /*! 
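For readers following the stack allocator changes threaded through this series (the TVM_CRT_DEBUG guard introduced earlier and renamed to TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK in the patches that follow): each request is padded up to the CRT allocation alignment (16 bytes in the configuration exercised by the tests), and when the check is enabled a 4-byte tag of total_size ^ STACK_ALLOCATOR_TAG is stored just past the padded block so that Free() can verify blocks are released in the reverse order they were handed out. The self-contained sketch below only reproduces that arithmetic for one made-up request size; it is not part of the patch, and the 4-byte tag width is an assumption taken from the uint32_t store in the code above.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Constants mirrored from the CRT sources in this series.
static const uint32_t kAlignBytes = 16;    // TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES
static const uint32_t kTag = 0xabcd1234;   // STACK_ALLOCATOR_TAG
static const uint32_t kTagSizeBytes = 4;   // STACK_ALLOCATOR_TAG_SIZE_BYTES (assumed: one uint32_t)

int main() {
  uint32_t nbytes = 24;  // illustrative request size

  // Pad so the next allocation pointer stays 16-byte aligned.
  uint32_t offset_bytes = (kAlignBytes - nbytes) & (kAlignBytes - 1);
  uint32_t total_size = nbytes + offset_bytes + kTagSizeBytes;

  // Allocate() stores total_size ^ TAG just past the padded block ...
  uint32_t stored_tag = total_size ^ kTag;

  // ... and Free() recovers the expected size from the word sitting below the
  // current top of stack, then compares it with the distance to the pointer
  // being freed. The two only agree when frees happen in reverse allocation
  // order, which is what the check above is meant to enforce.
  uint32_t expected_size = stored_tag ^ kTag;
  assert(expected_size == total_size);

  std::printf("nbytes=%u offset=%u total=%u tag=0x%08x\n", (unsigned)nbytes,
              (unsigned)offset_bytes, (unsigned)total_size, (unsigned)stored_tag);
  return 0;
}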
diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 488e18f59918..567557c56794 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -23,7 +23,6 @@ #include "rocm_module.h" #include -#include #include #include @@ -33,6 +32,7 @@ #include #include "../file_utils.h" +#include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "rocm_common.h" diff --git a/src/runtime/rocm/rocm_module.h b/src/runtime/rocm/rocm_module.h index aef6560f243c..c17e123c1a12 100644 --- a/src/runtime/rocm/rocm_module.h +++ b/src/runtime/rocm/rocm_module.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_ROCM_ROCM_MODULE_H_ #define TVM_RUNTIME_ROCM_ROCM_MODULE_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../meta_data.h" + namespace tvm { namespace runtime { diff --git a/src/runtime/vulkan/vulkan_module.h b/src/runtime/vulkan/vulkan_module.h index c622042fd5cf..c75a077a361d 100644 --- a/src/runtime/vulkan/vulkan_module.h +++ b/src/runtime/vulkan/vulkan_module.h @@ -20,11 +20,10 @@ #ifndef TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ #define TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ -#include - #include #include +#include "../meta_data.h" #include "vulkan_shader.h" namespace tvm { From a3af874cd89633606c4c888b69cc30475cdbbda4 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Sat, 24 Apr 2021 00:18:19 +0100 Subject: [PATCH 22/33] addressing comments - 8 Change-Id: I90bced4e18259a6d42e6a406d93958e204f3859e --- include/tvm/runtime/crt/stack_allocator.h | 1 + include/tvm/runtime/executor_info.h | 39 ++++++ include/tvm/runtime/meta_data.h | 111 ------------------ python/tvm/relay/backend/executor_factory.py | 15 +-- src/relay/backend/utils.h | 3 +- src/runtime/crt/host/crt_config.h | 3 + src/runtime/crt/memory/stack_allocator.c | 16 ++- src/runtime/meta_data.h | 36 ++++++ src/runtime/metadata_module.cc | 3 +- src/target/build_common.h | 3 +- src/target/metadata_module.cc | 3 +- src/target/metadata_module.h | 3 +- src/target/source/codegen_source_base.h | 3 +- src/target/source/source_module.h | 3 +- src/tir/transforms/lower_tvm_builtin.cc | 2 +- tests/cpp/relay_build_module_test.cc | 2 +- tests/cpp/utvm_runtime_standalone_test.cc | 2 +- .../relay/test_backend_graph_executor.py | 10 +- 18 files changed, 116 insertions(+), 142 deletions(-) create mode 100644 include/tvm/runtime/executor_info.h delete mode 100644 include/tvm/runtime/meta_data.h diff --git a/include/tvm/runtime/crt/stack_allocator.h b/include/tvm/runtime/crt/stack_allocator.h index 1858abb3a4fc..eb90f832a419 100644 --- a/include/tvm/runtime/crt/stack_allocator.h +++ b/include/tvm/runtime/crt/stack_allocator.h @@ -23,6 +23,7 @@ #include #include +#include "crt_config.h" #include "error_codes.h" #define STACK_ALLOCATOR_TAG 0xabcd1234 diff --git a/include/tvm/runtime/executor_info.h b/include/tvm/runtime/executor_info.h new file mode 100644 index 000000000000..5b3572120c9a --- /dev/null +++ b/include/tvm/runtime/executor_info.h @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file executor_info.h + * \brief Executor information + */ +#ifndef TVM_RUNTIME_EXECUTOR_INFO_H_ +#define TVM_RUNTIME_EXECUTOR_INFO_H_ + +namespace tvm { +namespace runtime { + +/*! \brief Value used to indicate the graph executor. */ +static constexpr const char* kTvmExecutorGraph = "graph"; + +/*! \brief Value used to indicate the aot executor. */ +static constexpr const char* kTvmExecutorAot = "aot"; + +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_EXECUTOR_INFO_H_ diff --git a/include/tvm/runtime/meta_data.h b/include/tvm/runtime/meta_data.h deleted file mode 100644 index b447b174c90c..000000000000 --- a/include/tvm/runtime/meta_data.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file meta_data.h - * \brief Meta data related utilities - */ -#ifndef TVM_RUNTIME_META_DATA_H_ -#define TVM_RUNTIME_META_DATA_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace tvm { -namespace runtime { - -/*! \brief Value used to indicate the graph executor. */ -static constexpr const char* kTvmExecutorGraph = "graph"; - -/*! \brief Value used to indicate the aot executor. */ -static constexpr const char* kTvmExecutorAot = "aot"; - -/*! - * \brief Structure that can be optionally used by the executor codegen - */ -class MetadataNode : public Object { - public: - /*! \brief number of inputs of the main function */ - int num_inputs = 1; - /*! \brief number of outputs of the main function */ - int num_outputs = 1; - /*! \brief the executor to be used to run the model */ - String executor = kTvmExecutorGraph; - - static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const char* _type_key = "MetadataObj"; - TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); -}; - -/*! - * \brief Managed reference to MetadataNode. - */ -class Metadata : public ObjectRef { - public: - TVM_DLL Metadata(int num_inputs, int num_outputs, String executor) { - auto n = make_object(); - n->num_inputs = num_inputs; - n->num_outputs = num_outputs; - n->executor = executor; - data_ = std::move(n); - } - - TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); -}; - -/*! - * \brief Create a metadata module object. 
- * - * \param metadata The variable name to ndarray mapping. - * \param sym_vars The symbol to the list of required constant variables - * mapping. - * - * \return The created metadata module. - */ -Module MetadataModuleCreate( - const std::unordered_map& metadata, - const std::unordered_map>& sym_vars); - -/*! \brief function information needed by device */ -struct FunctionInfo { - std::string name; - std::vector arg_types; - std::vector thread_axis_tags; - - void Save(dmlc::JSONWriter* writer) const; - void Load(dmlc::JSONReader* reader); - void Save(dmlc::Stream* writer) const; - bool Load(dmlc::Stream* reader); -}; -} // namespace runtime -} // namespace tvm - -namespace dmlc { -DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::FunctionInfo, true); -} // namespace dmlc -#endif // TVM_RUNTIME_META_DATA_H_ diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index ed87403bb278..b61d9000bb32 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -30,7 +30,7 @@ class ExecutorFactoryModule: """ @abstractmethod - def get_excecutor_config(self): + def get_executor_config(self): """Common function to return the internal representation the executor relies upon to execute the network """ @@ -38,11 +38,7 @@ def get_excecutor_config(self): @abstractmethod def get_params(self): - """ - Sometimes we want to get params explicitly. - For example, we want to save its params value to - an independent file. - """ + """Return the compiled parameters.""" raise NotImplementedError @abstractmethod @@ -51,7 +47,6 @@ def get_lib(self): raise NotImplementedError def __getitem__(self, item): - print(item) return self.module.__getitem__(item) def __iter__(self): @@ -69,7 +64,7 @@ def __next__(self): if self.iter_cnt > 2: raise StopIteration - objs = [self.get_excecutor_config(), self.lib, self.params] + objs = [self.get_executor_config(), self.lib, self.params] obj = objs[self.iter_cnt] self.iter_cnt += 1 return obj @@ -101,7 +96,7 @@ def __init__(self, ir_mod, target, libmod, libmod_name, params): def get_params(self): return self.params - def get_excecutor_config(self): + def get_executor_config(self): return None def get_lib(self): @@ -153,7 +148,7 @@ def get_params(self): def get_graph_json(self): return self.graph_json - def get_excecutor_config(self): + def get_executor_config(self): return self.graph_json def get_lib(self): diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index e8288305488d..c804768c99af 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -41,6 +40,8 @@ #include #include +#include "../../runtime/meta_data.h" + namespace tvm { namespace relay { namespace backend { diff --git a/src/runtime/crt/host/crt_config.h b/src/runtime/crt/host/crt_config.h index b81a74eb4ae6..f5533298cae9 100644 --- a/src/runtime/crt/host/crt_config.h +++ b/src/runtime/crt/host/crt_config.h @@ -51,6 +51,9 @@ /*! \brief Maximum length of a PackedFunc function name. */ #define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 +/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. 
*/ +#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK 0 + // #define TVM_CRT_FRAMER_ENABLE_LOGS #endif // TVM_RUNTIME_CRT_HOST_CRT_CONFIG_H_ diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index abc41cff55b1..202569b8dee4 100644 --- a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -18,16 +18,19 @@ */ // LINT_C_FILE #include -#ifdef TVM_CRT_DEBUG +#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 #include #endif void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes) { - uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 1); + // reserve bytes at the end of the allocation such that + // next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0. + uint32_t offset_bytes = + (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - nbytes) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 1); uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; -#ifdef TVM_CRT_DEBUG +#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; @@ -41,10 +44,11 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ } tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { -#ifdef TVM_CRT_DEBUG +#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); - uint32_t total_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); - CHECK_EQ(tag, total_size ^ STACK_ALLOCATOR_TAG, "tag did not match"); + uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); + uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; + CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering"); #endif tvm_runtime_workspace->next_alloc = ptr; return 0; diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 03dba399fcb4..495b3f22e6ad 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -26,12 +26,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include "runtime_base.h" @@ -39,6 +41,40 @@ namespace tvm { namespace runtime { +/*! + * \brief Structure that can be optionally used by the executor codegen + */ +class MetadataNode : public Object { + public: + /*! \brief number of inputs of the main function */ + int num_inputs = 1; + /*! \brief number of outputs of the main function */ + int num_outputs = 1; + /*! \brief the executor to be used to run the model */ + String executor = kTvmExecutorGraph; + + static constexpr const uint32_t _type_index = TypeIndex::kDynamic; + static constexpr const char* _type_key = "MetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); +}; + +/*! + * \brief Managed reference to MetadataNode. + */ +class Metadata : public ObjectRef { + public: + TVM_DLL Metadata(int num_inputs, int num_outputs, String executor) { + auto n = make_object(); + n->num_inputs = num_inputs; + n->num_outputs = num_outputs; + n->executor = executor; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); +}; + /*! 
* \brief Create a metadata module object. * diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc index 4e5f16de7777..4a1d89ce1a1f 100644 --- a/src/runtime/metadata_module.cc +++ b/src/runtime/metadata_module.cc @@ -28,7 +28,6 @@ * codegen and runtimes. */ #include -#include #include #include #include @@ -36,6 +35,8 @@ #include #include +#include "meta_data.h" + namespace tvm { namespace runtime { diff --git a/src/target/build_common.h b/src/target/build_common.h index 7c65b1771d70..1816c3ac2650 100644 --- a/src/target/build_common.h +++ b/src/target/build_common.h @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -36,6 +35,8 @@ #include #include +#include "../runtime/meta_data.h" + namespace tvm { namespace codegen { diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 618ab8bebe19..db4051e00fd2 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -23,10 +23,9 @@ */ #include "metadata_module.h" -#include - #include +#include "../runtime/meta_data.h" #include "llvm/llvm_module.h" #include "source/source_module.h" diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h index f660ea0d2c70..add05ba52692 100644 --- a/src/target/metadata_module.h +++ b/src/target/metadata_module.h @@ -26,7 +26,6 @@ #define TVM_TARGET_METADATA_MODULE_H_ #include -#include #include #include #include @@ -34,6 +33,8 @@ #include #include +#include "../runtime/meta_data.h" + namespace tvm { namespace codegen { diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 23be6189c82e..80fc9b486971 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -25,7 +25,6 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_ #define TVM_TARGET_SOURCE_CODEGEN_SOURCE_BASE_H_ -#include #include #include #include @@ -35,6 +34,8 @@ #include #include +#include "../../runtime/meta_data.h" + namespace tvm { namespace codegen { diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index 33b93df94a79..6226ba2f22b3 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -26,10 +26,11 @@ #define TVM_TARGET_SOURCE_SOURCE_MODULE_H_ #include -#include #include #include +#include "../../runtime/meta_data.h" + namespace tvm { namespace codegen { diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 7cad8b63cbca..0e2e612e3ae8 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -181,7 +181,7 @@ class BuiltinLower : public StmtExprMutator { if (op->op.same_as(builtin::tvm_call_packed())) { return MakeCallPacked(op, /* use_string_lookup */ true); } else if (op->op.same_as(builtin::tvm_call_cpacked())) { - return MakeCallPacked(op, false); + return MakeCallPacked(op, /* use_string_lookup */ false); } else if (op->op.same_as(builtin::tvm_call_trace_packed())) { return MakeCallTracePacked(op); } else if (op->op.same_as(builtin::tvm_stack_make_shape())) { diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 020513905be3..314185240563 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tests/cpp/utvm_runtime_standalone_test.cc b/tests/cpp/utvm_runtime_standalone_test.cc index 1c8db191d982..e674c3b74144 
100644 --- a/tests/cpp/utvm_runtime_standalone_test.cc +++ b/tests/cpp/utvm_runtime_standalone_test.cc @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 70a67c4aec44..8f6e4b3a4d3e 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -150,10 +150,12 @@ def test_plan_memory(): assert len(storage_sizes) == 4 # Check the specific size of each sid - assert storage_sizes[0][0] == 40 - assert storage_sizes[1][0] == 4 - assert storage_sizes[2][0] == 4 - assert storage_sizes[3][0] == 40 + assert ( + storage_sizes[0][0] == 40 + and storage_sizes[1][0] == 4 + and storage_sizes[2][0] == 4 + and storage_sizes[3][0] == 40 + ) def test_reshape_nop(): From 50a94038eec6e7dd131baa721a7af40b271c0734 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 26 Apr 2021 14:02:46 +0100 Subject: [PATCH 23/33] rebasing Change-Id: Id28751b069bd046f00faee301b2b446b2ea4fab8 --- src/runtime/metal/metal_module.mm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index a8b01815bf68..e22caa21a81e 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -185,6 +185,8 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons @autoreleasepool { metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); int device_id = t->device.device_id; + auto stream = static_cast(t->stream[device_id]); + if (stream->HasErrorHappened()) return; if (scache_[device_id] == nil) { scache_[device_id] = m_->GetPipelineState(device_id, func_name_); } @@ -192,8 +194,7 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2); auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup; CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup); - id queue = w_->GetCommandQueue(t->device); - id cb = [queue commandBuffer]; + id cb = stream->GetCommandBuffer(); id encoder = [cb computeCommandEncoder]; [encoder setComputePipelineState:scache_[device_id]]; for (size_t i = 0; i < num_buffer_args_; ++i) { From 15a7b0d940e9dac3ff88e715ff42b745f7d0f9cd Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 26 Apr 2021 15:23:40 +0100 Subject: [PATCH 24/33] Addressing comments - 9 Change-Id: I06c9f280de0a9bf0ca5545bbbbfcc70cb66831b3 --- src/runtime/crt/crt_config-template.h | 3 ++ src/runtime/crt/host/crt_config.h | 2 +- src/runtime/crt/memory/stack_allocator.c | 6 ++-- tests/crt/aot_memory_test.cc | 38 +++++++++++++++++------- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/runtime/crt/crt_config-template.h b/src/runtime/crt/crt_config-template.h index 67e0608ab696..907559421e5d 100644 --- a/src/runtime/crt/crt_config-template.h +++ b/src/runtime/crt/crt_config-template.h @@ -51,4 +51,7 @@ /*! \brief DLDataType for the return value from strlen */ #define TVM_CRT_STRLEN_DLTYPE 10 +/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. 
Off by default */ +// #define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK + #endif // TVM_RUNTIME_CRT_CRT_CONFIG_TEMPLATE_H_ diff --git a/src/runtime/crt/host/crt_config.h b/src/runtime/crt/host/crt_config.h index f5533298cae9..b0a68c939070 100644 --- a/src/runtime/crt/host/crt_config.h +++ b/src/runtime/crt/host/crt_config.h @@ -52,7 +52,7 @@ #define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 /*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. */ -#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK 0 +#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK // #define TVM_CRT_FRAMER_ENABLE_LOGS diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c index 202569b8dee4..07b6b368cdef 100644 --- a/src/runtime/crt/memory/stack_allocator.c +++ b/src/runtime/crt/memory/stack_allocator.c @@ -18,7 +18,7 @@ */ // LINT_C_FILE #include -#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK #include #endif @@ -30,7 +30,7 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; -#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; @@ -44,7 +44,7 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_ } tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { -#if TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK > 0 +#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1); uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr); uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG; diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index 259550d0a813..bcd0d40274ac 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -16,30 +16,30 @@ * specific language governing permissions and limitations * under the License. 
*/ - #include #include +#include "platform.cc" /* * Tests allocations are properly aligned when allocated */ TEST(AOTMemory, Allocate) { - static uint8_t model_memory[80]; + static uint8_t model_memory[96]; tvm_workspace_t tvm_runtime_workspace; - StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96); void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); - ASSERT_EQ(block_two, &model_memory[16]); + ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 24); - ASSERT_EQ(two_blocks, &model_memory[32]); + ASSERT_EQ(two_blocks, &model_memory[32 + 2 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); - ASSERT_EQ(block_three, &model_memory[64]); + ASSERT_EQ(block_three, &model_memory[64 + 3 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); } /* @@ -54,15 +54,15 @@ TEST(AOTMemory, Free) { ASSERT_EQ(block_one, &model_memory[0]); void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); - ASSERT_EQ(block_two, &model_memory[16]); + ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); - ASSERT_EQ(two_blocks, &model_memory[16]); + ASSERT_EQ(two_blocks, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); - ASSERT_EQ(block_three, &model_memory[16]); + ASSERT_EQ(block_three, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); } /* @@ -77,12 +77,30 @@ TEST(AOTMemory, OverAllocate) { ASSERT_EQ(block_one, &model_memory[0]); void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); - ASSERT_EQ(block_two, &model_memory[16]); + ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 64); ASSERT_EQ(two_blocks, (void*)NULL); } +/* + * Test for out-of-order memory deallocation + */ +TEST(AOTMemory, FreeOutOfOrder) { + static uint8_t model_memory[80]; + tvm_workspace_t tvm_runtime_workspace; + StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + + void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + ASSERT_EQ(block_one, &model_memory[0]); + + void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); + + ASSERT_EXIT(StackMemoryManager_Free(&tvm_runtime_workspace, block_one), + ::testing::ExitedWithCode(2), ""); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; From ca11312193650193927ddcfcb75504403349daca Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 26 Apr 2021 16:39:11 +0100 Subject: [PATCH 25/33] fix tests - 7 Change-Id: I739f29779862f05def36e5f3e0722019596d17f8 --- python/tvm/micro/model_library_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 07227abc42b9..4fd85ea38d98 100644 --- 
+++ b/python/tvm/micro/model_library_format.py
@@ -133,7 +133,7 @@ def export_model_library_format(mod: executor_factory.ExecutorFactoryModule, fil
     """
     tempdir = utils.tempdir()
     is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    memory_map = [] if is_aot else _build_memory_map(mod.get_excecutor_config())
+    memory_map = [] if is_aot else _build_memory_map(mod.get_executor_config())
     runtime = ["aot"] if is_aot else ["graph"]
     metadata = {

From 6151502cee7bb14a5487dce4d48f0391eac681b6 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Tue, 27 Apr 2021 14:43:47 +0100
Subject: [PATCH 26/33] Addressing comments - 9

Change-Id: Ie736f40a5225f4e56e79006753d7732127da5408
---
 python/tvm/relay/backend/executor_factory.py | 4 +---
 src/relay/backend/build_module.cc | 8 ++++----
 src/relay/backend/vm/compiler.cc | 2 +-
 src/target/source/codegen_source_base.h | 3 +--
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py
index b61d9000bb32..f81d8f9f1c15 100644
--- a/python/tvm/relay/backend/executor_factory.py
+++ b/python/tvm/relay/backend/executor_factory.py
@@ -31,9 +31,7 @@ class ExecutorFactoryModule:
     @abstractmethod
     def get_executor_config(self):
-        """Common function to return the internal representation
-        the executor relies upon to execute the network
-        """
+        """Return the internal configuration the executor uses to execute the network."""
         raise NotImplementedError

     @abstractmethod
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 10b2b9b56ea9..506c29fba57e 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -127,9 +127,9 @@ struct GraphCodegen : ExecutorCodegen {
     auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen");
     mod = (*pf)();
   }
-  void UpdateOutput(BuildOutput* ret) override { ret->graph_json = GetJSON(); }
+  void UpdateOutput(BuildOutput* ret) override { ret->graph_json = GetGraphJSON(); }

-  std::string GetJSON() { return CallFunc<std::string>("get_graph_json", nullptr); }
+  std::string GetGraphJSON() { return CallFunc<std::string>("get_graph_json", nullptr); }

   ~GraphCodegen() {}
 };
@@ -164,7 +164,7 @@ class RelayBuildModule : public runtime::ModuleNode {
   PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) final {
     if (name == "get_graph_json") {
       return PackedFunc(
-          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetJSON(); });
+          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetGraphJSON(); });
     } else if (name == "get_module") {
       return PackedFunc(
           [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); });
@@ -210,7 +210,7 @@ class RelayBuildModule : public runtime::ModuleNode {
   *
   * \return const std::string graph_json
   */
-  const std::string& GetJSON() { return ret_.graph_json; }
+  const std::string& GetGraphJSON() { return ret_.graph_json; }

   /*!
   * \brief Get the Module object
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 1e231e65424d..d302965b8108 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -1173,7 +1173,7 @@ void VMCompiler::Codegen() {
     // to make sure a DSO module will be also available.
     lib = codegen::CSourceModuleCreate(";", "", Array<String>{});
   }
-  lib = codegen::CreateMetadataModule(params_, lib, ext_mods, target_host_);
+  lib = codegen::CreateMetadataModule(params_, lib, ext_mods, target_host_, runtime::Metadata());
   exec_->SetLib(lib);
 }
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index 80fc9b486971..ff0d079f5425 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -156,8 +156,7 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt,
 */
 runtime::Module CreateMetadataModule(
     const std::unordered_map<std::string, runtime::NDArray>& params, runtime::Module target_module,
-    const Array<runtime::Module>& ext_modules, Target target,
-    runtime::Metadata metadata = runtime::Metadata());
+    const Array<runtime::Module>& ext_modules, Target target, runtime::Metadata metadata);

 /*!
 * \brief Create a source module for viewing and limited saving for device.

From 043787f77c4f0f75befa0ade10d7e44324701c96 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Tue, 27 Apr 2021 18:46:36 +0100
Subject: [PATCH 27/33] Applying comments + fixing tests

Change-Id: I83e16068b93aaccc7a86b79d42f13328bc76b53d
---
 src/relay/backend/aot_executor_codegen.cc | 2 --
 src/runtime/crt/memory/stack_allocator.c | 3 +++
 tests/python/relay/aot/aot_test_utils.py | 16 ++++++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index baef6df13af8..707c531160e9 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -43,8 +43,6 @@ namespace relay {
 namespace backend {

 using IntegerArray = Array<Integer>;
-using ShapeVector = std::vector<std::vector<int64_t>>;
-using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
 using TargetsMap = std::unordered_map<int, Target>;

 class AotReturnSidVisitor : public ExprVisitor {
diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c
index 07b6b368cdef..286970cc926a 100644
--- a/src/runtime/crt/memory/stack_allocator.c
+++ b/src/runtime/crt/memory/stack_allocator.c
@@ -31,6 +31,9 @@ void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_
   uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes;
   uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size;
 #ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
+  if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) {
+    return NULL;
+  }
   const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES);
   *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG;
   next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES;
diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
index cf242b6276e0..168c4c749ecb 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -61,6 +61,7 @@ def create_main(test_name, input_list, output_list, output_path):
     raw_path = file_path.with_suffix(".c").resolve()
     with open(raw_path, "w") as main_file:
         main_file.write("#include <stdio.h>\n")
+        main_file.write("#include <math.h>\n")
         main_file.write('#include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"\n')
         main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n')
         main_file.write("#define WORKSPACE_SIZE (16384*1024)\n")
@@ -108,11 +109,18 @@ def create_main(test_name, input_list, output_list, output_path):
         main_file.write("tvm_runtime_run(&network, inputs, outputs);")

         for i in range(0, len(output_list)):
+            is_real_dtype = output_list[i].dtype == "float32"
             main_file.write("for (int i = 0; i<output_data%s_len; i++){\n" % i)
+            if is_real_dtype:
+                main_file.write(
+                    'if (fabs(output_data%s[i]-expected_output_data%s[i]) > 0.001f){printf("ko\\n");return -1;}\n'
+                    % (i, i)
+                )
+            else:
+                main_file.write(
+                    'if (output_data%s[i]!=expected_output_data%s[i]){printf("ko\\n");return -1;}\n'
+                    % (i, i)
+                )
             main_file.write("}\n")
         main_file.write('printf("ok\\n");')

From 9a3a8703e712d89ff88e9200915bd2edb67c338d Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Wed, 28 Apr 2021 14:43:01 +0100
Subject: [PATCH 28/33] Applying comments - 10

Change-Id: I443d72f53913849f3c28fd6e416162d1ca99e647
---
 include/tvm/runtime/crt/stack_allocator.h | 7 ++--
 src/runtime/crt/memory/stack_allocator.c | 24 +++++++------
 tests/crt/aot_memory_test.cc | 44 ++++++++++++++---------
 tests/python/relay/aot/aot_test_utils.py | 4 +--
 4 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/include/tvm/runtime/crt/stack_allocator.h b/include/tvm/runtime/crt/stack_allocator.h
index eb90f832a419..daa403cb2764 100644
--- a/include/tvm/runtime/crt/stack_allocator.h
+++ b/include/tvm/runtime/crt/stack_allocator.h
@@ -45,10 +45,11 @@ typedef struct {
   size_t workspace_size;  // Total number of bytes in the workspace
 } tvm_workspace_t;

-void StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory,
-                             size_t workspace_size);
+tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace,
+                                        uint8_t* g_aot_memory, size_t workspace_size);

-void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes);
+tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
+                                            void**);

 tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr);
diff --git a/src/runtime/crt/memory/stack_allocator.c b/src/runtime/crt/memory/stack_allocator.c
index 286970cc926a..6722816ec538 100644
--- a/src/runtime/crt/memory/stack_allocator.c
+++ b/src/runtime/crt/memory/stack_allocator.c
@@ -22,28 +22,29 @@
 #include <tvm/runtime/crt/logging.h>
 #endif

-void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes) {
+tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
+                                            void** current_alloc) {
   // reserve bytes at the end of the allocation such that
   // next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0.
uint32_t offset_bytes = (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - nbytes) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 1); - uint8_t* current_alloc = tvm_runtime_workspace->next_alloc; - uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; uint8_t* workspace_end = tvm_runtime_workspace->workspace + tvm_runtime_workspace->workspace_size; + if (tvm_runtime_workspace->next_alloc + nbytes + offset_bytes > workspace_end) { + return kTvmErrorPlatformNoMemory; + } + (*current_alloc) = tvm_runtime_workspace->next_alloc; + uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes; #ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) { - return NULL; + return kTvmErrorPlatformNoMemory; } const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES); *((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG; next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES; #endif - if (next_alloc > workspace_end) { - return NULL; - } tvm_runtime_workspace->next_alloc = next_alloc; - return current_alloc; + return kTvmErrorNoError; } tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) { @@ -54,12 +55,13 @@ tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering"); #endif tvm_runtime_workspace->next_alloc = ptr; - return 0; + return kTvmErrorNoError; } -void StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, uint8_t* g_aot_memory, - size_t workspace_size) { +tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace, + uint8_t* g_aot_memory, size_t workspace_size) { tvm_runtime_workspace->next_alloc = g_aot_memory; tvm_runtime_workspace->workspace = g_aot_memory; tvm_runtime_workspace->workspace_size = workspace_size; + return kTvmErrorNoError; } diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index bcd0d40274ac..a6569f58ada0 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -28,17 +28,20 @@ TEST(AOTMemory, Allocate) { tvm_workspace_t tvm_runtime_workspace; StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96); - - void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); + void* block_two = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &block_two); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 24); + void* two_blocks = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 24, &two_blocks); ASSERT_EQ(two_blocks, &model_memory[32 + 2 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); - void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_three = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three); ASSERT_EQ(block_three, &model_memory[64 + 3 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -50,18 +53,22 @@ TEST(AOTMemory, Free) { tvm_workspace_t tvm_runtime_workspace; StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 
1, &block_one); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_two = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); - void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 2); + void* two_blocks = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &two_blocks); ASSERT_EQ(two_blocks, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - ASSERT_EQ(0, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); + ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); - void* block_three = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_three = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three); ASSERT_EQ(block_three, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -73,13 +80,16 @@ TEST(AOTMemory, OverAllocate) { tvm_workspace_t tvm_runtime_workspace; StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_two = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); - void* two_blocks = StackMemoryManager_Allocate(&tvm_runtime_workspace, 64); + void* two_blocks = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 64, &two_blocks); ASSERT_EQ(two_blocks, (void*)NULL); } @@ -91,10 +101,12 @@ TEST(AOTMemory, FreeOutOfOrder) { tvm_workspace_t tvm_runtime_workspace; StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); - void* block_one = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_one = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); ASSERT_EQ(block_one, &model_memory[0]); - void* block_two = StackMemoryManager_Allocate(&tvm_runtime_workspace, 1); + void* block_two = NULL; + StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EXIT(StackMemoryManager_Free(&tvm_runtime_workspace, block_one), diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 168c4c749ecb..1cea6dac26c5 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -78,11 +78,11 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write( """ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { - (*out_ptr) = StackMemoryManager_Allocate(&app_workspace, num_bytes); + return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr); } tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { - StackMemoryManager_Free(&app_workspace,ptr); + return StackMemoryManager_Free(&app_workspace,ptr); } void TVMPlatformAbort(tvm_crt_error_t code) { } From 358de45b6bcc21ad4626b3d3420ef61a11acc604 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini 
Date: Thu, 29 Apr 2021 11:21:44 +0100 Subject: [PATCH 29/33] Addressing comments - 11 Change-Id: I7fefbd0076949b9c38d0abbf2759ebf1502de330 --- python/tvm/relay/build_module.py | 37 ++++++++++++++++++++---- src/target/target_kind.cc | 1 + tests/python/relay/aot/aot_test_utils.py | 4 +-- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 35d28cddda0e..6a99b145e534 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -229,7 +229,34 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo return build(mod, target, params=params, mod_name=mod_name).module -def build(ir_mod, target=None, target_host=None, params=None, mod_name="default", executor="graph"): +def get_executor_from_target(target, target_host): + """Helper function to extract the executor parameter from the target + + Parameters + ---------- + target : Dict of targets for heterogeneous compilation + + target_host : Host compilation target + + Returns + ------- + executor : str + A string representing the executor type + """ + + # Default executor is graph + executor = "graph" + cpu_device_type = 1 + if target_host: + executor = target_host.attrs.get("executor", "graph") + else: + for device_type in target: + if device_type == cpu_device_type: + executor = target[device_type].attrs.get("executor", "graph") + return executor + + +def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph executor. @@ -259,11 +286,6 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" mod_name: Optional[str] The module name we will build - executor: Optional[str] - The type of executor to be used in order to run the model: - - If "graph" is specified, then the graph_executor will be used - - If "aot" is specified, then the aot_executor will be used - Returns ------- <<<<<<< HEAD @@ -313,6 +335,9 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" target, target_host, target_is_dict_key=False ) + # Retrieve the executor from the target + executor = get_executor_from_target(target, target_host) + # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 2ca4f4533c7e..474b1b0d8ac4 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -227,6 +227,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("runtime") .add_attr_option("mcpu") .add_attr_option("march") + .add_attr_option("executor") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLGPU) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 1cea6dac26c5..42c134bc47bd 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -161,10 +161,10 @@ def compile_and_run(mod, input_list, output_list, params=None): """ This method verifies the generated source """ - target = "c -runtime=c --link-params" + target = "c -runtime=c --link-params --executor=aot" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - lib = tvm.relay.build(mod, target, target_host=target, params=params, executor="aot") + 
lib = tvm.relay.build(mod, target, target_host=target, params=params) tmp_path = utils.tempdir() tmp_dir = tmp_path.temp_dir From 9a67b3f867d10c4a0a54eba8ebc126d9f49be4a0 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 4 May 2021 14:09:48 +0100 Subject: [PATCH 30/33] Addressing comments - 11 Change-Id: Iad028144d7b394b2dd2fce41a35ca689d1680200 --- python/tvm/relay/build_module.py | 25 ---------------- src/relay/backend/aot_executor_codegen.cc | 7 ++--- tests/crt/aot_memory_test.cc | 35 ++++++++++++----------- tests/python/relay/aot/aot_test_utils.py | 4 +-- 4 files changed, 23 insertions(+), 48 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 6a99b145e534..2d8c8207c930 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -116,7 +116,6 @@ def build(self, mod, target=None, target_host=None, params=None, executor="graph Returns ------- -<<<<<<< HEAD graph_json : str The json string that can be accepted by graph executor. @@ -125,10 +124,6 @@ def build(self, mod, target=None, target_host=None, params=None, executor="graph params : dict The parameters of the final graph. -======= - factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule - The runtime factory for the TVM executor. ->>>>>>> f65012308... Addressing comments - 3 """ target = _update_target(target) target, target_host = Target.check_and_update_host_consist( @@ -288,28 +283,8 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- -<<<<<<< HEAD -<<<<<<< HEAD factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. -======= - internal_repr : str or tir.PrimFunc - The internal representation the executor uses to execute the - network. Can be a string representing the json graph (if we are - building for graph executor) or the PrimFunc representing the - AOT runner function -======= - executor_config : str - The internal configuration the executor uses to execute the - network. ->>>>>>> db667146e... addressing comments - 6 - - mod : tvm.Module - The module containing necessary libraries. - - params : dict - The parameters of the final graph. ->>>>>>> f65012308... 
Addressing comments - 3 """ # pylint: enable=line-too-long # fmt: on diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 707c531160e9..1939e05e2075 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -22,7 +22,6 @@ * \brief Graph runtime codegen */ -#include #include #include #include @@ -212,7 +211,7 @@ class AOTExecutorCodegen : public ExprVisitor { */ void CreateFuncCall(Call call, std::string func_name) { tvm::Array args{tvm::tir::StringImm(func_name)}; - std::vector CreateFuncCall_stmts; + std::vector create_func_call_stmts; // Pack the inputs for (Expr arg : call->args) { @@ -228,9 +227,9 @@ class AOTExecutorCodegen : public ExprVisitor { } // Use tvm_call_packed to execute the function - CreateFuncCall_stmts.push_back(tir::Evaluate( + create_func_call_stmts.push_back(tir::Evaluate( tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args))); - tir::Stmt body = tir::SeqStmt(CreateFuncCall_stmts); + tir::Stmt body = tir::SeqStmt(create_func_call_stmts); stmts_.push_back(body); } diff --git a/tests/crt/aot_memory_test.cc b/tests/crt/aot_memory_test.cc index a6569f58ada0..ecae2ef52f59 100644 --- a/tests/crt/aot_memory_test.cc +++ b/tests/crt/aot_memory_test.cc @@ -27,21 +27,21 @@ TEST(AOTMemory, Allocate) { static uint8_t model_memory[96]; tvm_workspace_t tvm_runtime_workspace; - StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96); + ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 96), kTvmErrorNoError); void* block_one = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &block_two); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &block_two), kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 24, &two_blocks); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 24, &two_blocks), kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[32 + 2 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* block_three = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[64 + 3 * STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -51,24 +51,24 @@ TEST(AOTMemory, Allocate) { TEST(AOTMemory, Free) { static uint8_t model_memory[80]; tvm_workspace_t tvm_runtime_workspace; - StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, block_two)); 
void* two_blocks = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &two_blocks); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 2, &two_blocks), kTvmErrorNoError); ASSERT_EQ(two_blocks, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EQ(kTvmErrorNoError, StackMemoryManager_Free(&tvm_runtime_workspace, two_blocks)); void* block_three = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_three), kTvmErrorNoError); ASSERT_EQ(block_three, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); } @@ -78,18 +78,19 @@ TEST(AOTMemory, Free) { TEST(AOTMemory, OverAllocate) { static uint8_t model_memory[72]; tvm_workspace_t tvm_runtime_workspace; - StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); void* two_blocks = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 64, &two_blocks); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 64, &two_blocks), + kTvmErrorPlatformNoMemory); ASSERT_EQ(two_blocks, (void*)NULL); } @@ -99,14 +100,14 @@ TEST(AOTMemory, OverAllocate) { TEST(AOTMemory, FreeOutOfOrder) { static uint8_t model_memory[80]; tvm_workspace_t tvm_runtime_workspace; - StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80); + ASSERT_EQ(StackMemoryManager_Init(&tvm_runtime_workspace, model_memory, 80), kTvmErrorNoError); void* block_one = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_one), kTvmErrorNoError); ASSERT_EQ(block_one, &model_memory[0]); void* block_two = NULL; - StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two); + ASSERT_EQ(StackMemoryManager_Allocate(&tvm_runtime_workspace, 1, &block_two), kTvmErrorNoError); ASSERT_EQ(block_two, &model_memory[16 + STACK_ALLOCATOR_TAG_SIZE_BYTES]); ASSERT_EXIT(StackMemoryManager_Free(&tvm_runtime_workspace, block_one), diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 42c134bc47bd..8273d3954d3b 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -109,9 +109,9 @@ def create_main(test_name, input_list, output_list, output_path): main_file.write("tvm_runtime_run(&network, inputs, outputs);") for i in range(0, len(output_list)): - is_real_dtype = output_list[i].dtype == "float32" + is_float_dtype = output_list[i].dtype == "float32" main_file.write("for (int i = 0; i 0.001f){printf("ko\\n");return -1;}\n' % (i, i) From 2dec7406caa9e5001366feabbffe0dec5e559233 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 4 May 2021 15:51:41 +0100 Subject: [PATCH 31/33] fix tests - 7 Change-Id: I14286e665dcdba1e9bc10bb5a27dd6ced50372b0 --- python/tvm/driver/tvmc/compiler.py | 7 ------- tests/python/relay/test_backend_graph_executor.py | 2 +- 2 files changed, 1 
insertion(+), 8 deletions(-) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index dffa06671191..3f1d04aee7fd 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -251,15 +251,8 @@ def compile_model( source = str(mod) if source_type == "relay" else lib.get_source(source_type) dumps[source_type] = source -<<<<<<< HEAD # Create a new tvmc model package object from the graph definition. package_path = tvmc_model.export_package(graph_module, package_path, cross, export_format) -======= - # TODO we need to update this return to use the updated graph module APIs - # as these getter functions will be deprecated in the next release (@leandron) - return graph_module.get_graph_json(), graph_module.get_lib(), graph_module.get_params(), dumps - ->>>>>>> 99ce0408b... fix tests - 2 # Write dumps to file. if dumps: diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 8f6e4b3a4d3e..06623e0baa24 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -173,7 +173,7 @@ def test_reshape_nop(): func = relay.Function([x], relay.Tuple([z0, z1, z2])) x_data = np.random.rand(10, 4).astype("float32") graph = relay.build(tvm.IRModule.from_expr(func), "llvm") - graph_json_str = graph.get_json() + graph_json_str = graph.get_graph_json() graph_json = json.loads(graph_json_str) From f5db29ea4bb4304633eb8956c773af7bc11782f6 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 4 May 2021 18:19:13 +0100 Subject: [PATCH 32/33] fixing tests -8 Change-Id: I7b4c966da9680870ceda1704c749ee3bdc751926 --- python/tvm/driver/tvmc/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/model.py b/python/tvm/driver/tvmc/model.py index a26a47c788fe..0b48c49bab2f 100644 --- a/python/tvm/driver/tvmc/model.py +++ b/python/tvm/driver/tvmc/model.py @@ -52,7 +52,7 @@ import tvm.contrib.cc from tvm import relay from tvm.contrib import utils -from tvm.relay.backend.graph_executor_factory import GraphExecutorFactoryModule +from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule from .common import TVMCException From 5a0222239ded7f72a8e9352d738bcd16ef9571a5 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 4 May 2021 19:51:28 +0100 Subject: [PATCH 33/33] fixing tests - 9 Change-Id: Icf62128a604998ed1b7d5af4cbeadf7d39196d0b --- python/tvm/driver/tvmc/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/model.py b/python/tvm/driver/tvmc/model.py index 0b48c49bab2f..e48125f0f619 100644 --- a/python/tvm/driver/tvmc/model.py +++ b/python/tvm/driver/tvmc/model.py @@ -220,7 +220,7 @@ def export_package( self.lib_path = path_lib with open(temp.relpath(graph_name), "w") as graph_file: - graph_file.write(executor_factory.get_json()) + graph_file.write(executor_factory.get_graph_json()) with open(temp.relpath(param_name), "wb") as params_file: params_file.write(relay.save_param_dict(executor_factory.get_params()))
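
Note: the sketch below is illustrative and not part of the patch series. It shows, in plain C, how the revised stack-allocator API (patches 28-30: StackMemoryManager_Init/Allocate/Free returning tvm_crt_error_t) is expected to be wired into the CRT platform hooks, mirroring the main() emitted by tests/python/relay/aot/aot_test_utils.py. The workspace size, the g_aot_memory buffer name, the empty-loop placeholder for running the model, and the extra tvm/runtime/c_runtime_api.h include (pulled in here only for DLDevice) are assumptions, not code taken from the series.

    #include <stdint.h>
    #include <tvm/runtime/c_runtime_api.h>        /* assumed include, provides DLDevice */
    #include "tvm/runtime/crt/stack_allocator.h"  /* header modified by this series */

    #define WORKSPACE_SIZE (16384 * 1024)         /* illustrative size */
    static uint8_t g_aot_memory[WORKSPACE_SIZE];  /* illustrative buffer name */
    static tvm_workspace_t app_workspace;

    /* The platform hooks simply forward to the stack allocator and propagate its
     * tvm_crt_error_t result (kTvmErrorNoError or kTvmErrorPlatformNoMemory). */
    tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
      return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
    }

    tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
      /* Frees must be issued in reverse allocation order; building with
       * TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK enforces this via the tag words. */
      return StackMemoryManager_Free(&app_workspace, ptr);
    }

    int main(void) {
      if (StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE) != kTvmErrorNoError) {
        return -1;
      }
      /* ... set up inputs/outputs and call the generated entry point, e.g.
       *     tvm_runtime_run(&network, inputs, outputs);                      */
      return 0;
    }

The same wiring appears in the generated test harness; checking every return value against kTvmErrorNoError is the pattern the updated aot_memory_test.cc uses once the allocator stops returning raw pointers.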