From 40e5672507f3d26394e96a1f76ae0b6edb45ab23 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 12 Nov 2020 22:40:21 -0800 Subject: [PATCH 01/60] refactor RPCSessionContext utils --- include/tvm/runtime/device_api.h | 54 ++++++++++++++-- src/runtime/rpc/rpc_device_api.cc | 35 ++++------ src/runtime/rpc/rpc_endpoint.cc | 2 +- src/runtime/rpc/rpc_module.cc | 104 +++++++++++++++++------------- 4 files changed, 122 insertions(+), 73 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c6a2ce3d28d0..40989e4057e0 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -240,13 +240,57 @@ inline const char* DeviceName(int type) { } } +/*! + * \brief Return true if a TVMContext is owned by an RPC session. + */ +inline bool IsRPCSessionContext(TVMContext ctx) { + return (ctx.device_type / kRPCSessMask) > 0; +} + +/*! + * \brief Return the RPCSessTable index of the RPC Session that owns this context. + * \return the table index. + */ +inline int GetRPCSessionIndex(TVMContext ctx) { + ICHECK(IsRPCSessionContext(ctx)) << "GetRPCSessionIndex: ctx has no RPC session"; + return ctx.device_type / kRPCSessMask - 1; +} + +/*! + * \brief Remove the RPC session mask from a TVMContext. + * RPC clients typically do this when encoding a TVMContext for transmission to an RPC remote. + * On the wire, RPCContext are expected to be valid on the server without interpretation. + * \param ctx A TVMContext with non-zero RPC Session mask, valid on the RPC client. + * \return A TVMContext without any RPC Session mask, valid on the RPC server. + */ +inline TVMContext RemoveRPCSessionMask(TVMContext ctx) { + ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); + return ctx; +} + +inline std::ostream& operator<<(std::ostream& os, DLContext ctx); + +/*! + * \brief Add a RPC session mask to a TVMContext. + * RPC clients typically do this when decoding a TVMContext received from a RPC remote. + * \param ctx A TVMContext without any RPC Session mask, valid on the RPC server. + * \param session_table_index Numeric index of the RPC session in the session table. + * \return A TVMContext with RPC session mask added, valid on the RPC client. 
+ */ +inline TVMContext AddRPCSessionMask(TVMContext ctx, int session_table_index) { + CHECK(!IsRPCSessionContext(ctx)) + << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx; + ctx.device_type = static_cast( + ctx.device_type | (kRPCSessMask * (session_table_index + 1))); + return ctx; +} + inline std::ostream& operator<<(std::ostream& os, DLContext ctx) { // NOLINT(*) - int device_type = static_cast(ctx.device_type); - if (device_type > kRPCSessMask) { - os << "remote[" << (device_type / kRPCSessMask) << "]-"; - device_type = device_type % kRPCSessMask; + if (IsRPCSessionContext(ctx)) { + os << "remote[" << GetRPCSessionIndex(ctx) << "]-"; + ctx = RemoveRPCSessionMask(ctx); } - os << runtime::DeviceName(device_type) << "(" << ctx.device_id << ")"; + os << runtime::DeviceName(static_cast(ctx.device_type)) << "(" << ctx.device_id << ")"; return os; } } // namespace runtime diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 943990fd9585..a1e96e92b4e0 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -34,19 +34,19 @@ namespace runtime { class RPCDeviceAPI final : public DeviceAPI { public: void SetDevice(TVMContext ctx) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->SetDevice(remote_ctx); } void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); } void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { auto sess = GetSess(ctx); - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); void* data = sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, nbytes, alignment, type_hint); @@ -57,7 +57,7 @@ class RPCDeviceAPI final : public DeviceAPI { } void FreeDataSpace(TVMContext ctx, void* ptr) final { RemoteSpace* space = static_cast(ptr); - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); try { GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); } catch (const dmlc::Error& e) { @@ -68,13 +68,11 @@ class RPCDeviceAPI final : public DeviceAPI { void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { - int from_dev_type = ctx_from.device_type; - int to_dev_type = ctx_to.device_type; - if (from_dev_type > kRPCSessMask && to_dev_type > kRPCSessMask) { + if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) { ICHECK(ctx_from.device_type == ctx_to.device_type) << "Cannot copy across two different remote session"; - auto remote_ctx_from = RemoveSessMask(ctx_from); - auto remote_ctx_to = RemoveSessMask(ctx_to); + auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); + auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); auto remote_ctx = remote_ctx_from; if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to; GetSess(ctx_from) @@ -82,12 +80,12 @@ class RPCDeviceAPI final : public DeviceAPI { ->CopyDataFromTo(static_cast(from)->data, from_offset, static_cast(to)->data, to_offset, size, remote_ctx_from, remote_ctx_to, type_hint, stream); - } else if (from_dev_type > kRPCSessMask && to_dev_type == kDLCPU) { - auto remote_ctx_from = 
RemoveSessMask(ctx_from); + } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) { + auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); GetSess(ctx_from)->CopyFromRemote(static_cast(from)->data, from_offset, to, to_offset, size, remote_ctx_from, type_hint); - } else if (from_dev_type == kDLCPU && to_dev_type > kRPCSessMask) { - auto remote_ctx_to = RemoveSessMask(ctx_to); + } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) { + auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); GetSess(ctx_to)->CopyToRemote(const_cast(from), from_offset, static_cast(to)->data, to_offset, size, remote_ctx_to, type_hint); @@ -97,22 +95,15 @@ class RPCDeviceAPI final : public DeviceAPI { } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream); } private: std::shared_ptr GetSess(TVMContext ctx) { - int dev_type = ctx.device_type; - ICHECK_GE(dev_type, kRPCSessMask); - int tbl_index = dev_type / kRPCSessMask - 1; + int tbl_index = GetRPCSessionIndex(ctx); return RPCSession::Get(tbl_index); } - - static TVMContext RemoveSessMask(TVMContext ctx) { - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); - return ctx; - } }; TVM_REGISTER_GLOBAL("device_api.rpc").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index b8c2a3bb0b97..fbdd93fb4f62 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -178,7 +178,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { << args[i].AsObjectRef()->GetTypeKey() << " is not supported by RPC"; } else if (tcode == kTVMContext) { DLContext ctx = args[i]; - ICHECK_LT(static_cast(ctx.device_type), kRPCSessMask) + ICHECK(!IsRPCSessionContext(ctx)) << "InternalError: cannot pass RPC context in the channel"; } } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index a3d888e927ed..7f810a229887 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -36,6 +36,52 @@ namespace tvm { namespace runtime { + +// deleter of RPC remote array +static void RemoteNDArrayDeleter(Object* obj) { + auto* ptr = static_cast(obj); + RemoteSpace* space = static_cast(ptr->dl_tensor.data); + space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle); + delete space; + delete ptr; +} + +/*! + * \brief Build a local NDArray with remote backing storage. + * \param handle A pointer valid on the remote end which should form the `data` field of the + * underlying DLTensor. + * \param shape The shape field of this DLTensor. + * \param ndim The rank of this DLTensor. + * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask. + * \param deleter A function invoked when the local NDArray object is no longer used. If `handle` + * needs to be explicitly deleted after the NDArray is freed, this function should do that. + * \param deleter_ctx An opaque pointer passed to deleter to identify the tensor being deleted. 
+ */ +NDArray NDArrayFromRemoteOpaqueHandle(void* handle, int64_t* shape, int64_t ndim, DLContext* ctx, FDeleter deleter, void* deleter_ctx) { + NDArray::Container* data = new NDArray::Container(); + data->manager_ctx = deleter_ctx; + data->SetDeleter(deleter); + RemoteSpace* space = new RemoteSpace(); + space->sess = sess_; + space->data = tensor->data; + data->dl_tensor.data = space; + NDArray ret(GetObjectPtr(data)); + // RAII now in effect + data->shape_ = std::vector(tensor->shape, tensor->shape + tensor->ndim); + data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); + data->dl_tensor.ndim = static_cast(data->shape_.size()); + // setup dtype + data->dl_tensor.dtype = tensor->dtype; + // setup ctx + data->dl_tensor.ctx = ctx; + // check strides. + ICHECK(tensor->strides == nullptr); + // setup byteoffset + data->dl_tensor.byte_offset = tensor->byte_offset; + return ret; +} + + /*! * \brief A wrapped remote function as a PackedFunc. */ @@ -108,47 +154,10 @@ class RPCWrappedFunc : public Object { // remove a remote session mask TVMContext RemoveSessMask(TVMContext ctx) const { - int dev_type = ctx.device_type; - ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) - << "Can not pass in local context or context with a different remote session"; - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); - return ctx; - } - - // deleter of RPC remote array - static void RemoteNDArrayDeleter(Object* obj) { - auto* ptr = static_cast(obj); - RemoteSpace* space = static_cast(ptr->dl_tensor.data); - space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle); - delete space; - delete ptr; - } - - // wrap return value as remote NDArray. - NDArray WrapRemoteNDArray(DLTensor* tensor, void* nd_handle) const { - NDArray::Container* data = new NDArray::Container(); - data->manager_ctx = nd_handle; - data->SetDeleter(RemoteNDArrayDeleter); - RemoteSpace* space = new RemoteSpace(); - space->sess = sess_; - space->data = tensor->data; - data->dl_tensor.data = space; - NDArray ret(GetObjectPtr(data)); - // RAII now in effect - data->shape_ = std::vector(tensor->shape, tensor->shape + tensor->ndim); - data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); - data->dl_tensor.ndim = static_cast(data->shape_.size()); - // setup dtype - data->dl_tensor.dtype = tensor->dtype; - // setup ctx, encode as remote session - data->dl_tensor.ctx.device_id = tensor->ctx.device_id; - data->dl_tensor.ctx.device_type = static_cast( - static_cast(tensor->ctx.device_type) + kRPCSessMask * (sess_->table_index() + 1)); - // check strides. - ICHECK(tensor->strides == nullptr); - // setup byteoffset - data->dl_tensor.byte_offset = tensor->byte_offset; - return ret; + ICHECK(IsRPCSessionContext(ctx)) << "Can not pass in local context"; + ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) + << "Can not pass in context with a different remote session"; + return RemoveRPCSessionMask(ctx); } }; @@ -189,10 +198,9 @@ class RPCModuleNode final : public ModuleNode { int min_repeat_ms, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. 
- int dev_type = ctx.device_type; - ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) + ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) << "ValueError: Need to pass the matched remote context to RPCModule.GetTimeEvaluator"; - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); + ctx = RemoveRPCSessionMask(ctx); if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, @@ -283,7 +291,7 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons ICHECK_EQ(args.size(), 3); DLTensor* tensor = args[1]; void* nd_handle = args[2]; - *rv = WrapRemoteNDArray(tensor, nd_handle); + *rv = NDArrayFromRemoteOpaqueHandle(tensor->data, tensor->shape, tensor->ndim, AddRPCSessionMask(ctx, sess_->table_index()), RemoteNDArrayDeleter, nd_handle); } else { ICHECK_EQ(args.size(), 2); *rv = args[1]; @@ -469,5 +477,11 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* *rv = static_cast(m.operator->())->sess()->table_index(); }); +TVM_REGISTER_GLOBAL("tvm.rpc.wrap_remote_ndarray").set_body_typed([](void* remote_array, PackedFunc deleter) { + *rv = WrapRemoteNDArray(remote_array, [pf](Object* ctx) { + pf(); + }); +}); + } // namespace runtime } // namespace tvm From 99ef7e4496445c67a2be37f8399eb066fccc4339 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 09:35:06 -0800 Subject: [PATCH 02/60] Make TVMLogf platform-independent. * Some platforms need to use an alternate printf() to support basic things like %zu. Since %zu is platform-specific, we prefer to use a printf() that supports it or allow the platform to fix it up as needed. --- include/tvm/runtime/crt/platform.h | 17 +++++++++++++++++ src/runtime/crt/host/main.cc | 5 +++++ src/runtime/crt/utvm_rpc_server/rpc_server.cc | 3 ++- tests/micro/qemu/zephyr-runtime/src/main.c | 5 +++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index 782060dfd000..3eac45f64e4e 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -25,6 +25,8 @@ #ifndef TVM_RUNTIME_CRT_PLATFORM_H_ #define TVM_RUNTIME_CRT_PLATFORM_H_ +#include +#include #include #ifdef __cplusplus @@ -39,6 +41,21 @@ extern "C" { */ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t code); +/*! \brief Called by the microTVM RPC server to implement TVMLogf. + * + * Not required to be implemented when the RPC server is not linked into the binary. This + * function's signature matches that of vsnprintf, so trivial implementations can just call + * vsnprintf. + * + * \param out_buf A char buffer where the formatted string should be written. + * \param out_buf_size_bytes Number of bytes available for writing in out_buf. + * \param fmt The printf-style formatstring. + * \param args extra arguments to be formatted. + * \return number of bytes written. 
+ */ +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, + const char* fmt, va_list args); + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 5623b2515585..60797c39b41d 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -43,6 +43,11 @@ ssize_t UTvmWriteFunc(void* context, const uint8_t* data, size_t num_bytes) { return to_return; } +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, + const char* fmt, va_list args) { + return vsnprintf(out_buf, out_buf_size_bytes, fmt, args); +} + void TVMPlatformAbort(tvm_crt_error_t error_code) { std::cerr << "TVMPlatformAbort: " << error_code << std::endl; throw "Aborted"; diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 34eff6a3270d..84930866367e 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -219,7 +219,8 @@ void TVMLogf(const char* format, ...) { va_list args; char log_buffer[256]; va_start(args, format); - size_t num_bytes_logged = vsnprintf(log_buffer, sizeof(log_buffer), format, args); + size_t num_bytes_logged = TVMPlatformFormatMessage( + log_buffer, sizeof(log_buffer), format, args); va_end(args); // Most header-based logging frameworks tend to insert '\n' at the end of the log message. diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c index 19e72e1c076d..91b13de7d04d 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -57,6 +57,11 @@ ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) { return size; } +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, + const char* fmt, va_list args) { + return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); +} + void TVMPlatformAbort(tvm_crt_error_t error) { sys_reboot(SYS_REBOOT_COLD); for (;;) From b9db1471303fc33cc02d69ad12a097b7130a2071 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 9 Nov 2020 09:14:29 -0800 Subject: [PATCH 03/60] test pass, make runtime part work (wip) --- include/tvm/runtime/module.h | 4 + python/tvm/relay/param_dict.py | 17 ++ src/relay/backend/build_module.cc | 29 +++ src/runtime/graph/graph_runtime.cc | 43 +++- src/runtime/graph/graph_runtime.h | 7 +- src/target/llvm/codegen_llvm.cc | 102 ++++++++++ src/target/llvm/codegen_params.cc | 184 ++++++++++++++++++ src/target/llvm/llvm_module.cc | 7 +- .../unittest/test_target_codegen_llvm.py | 3 + 9 files changed, 390 insertions(+), 6 deletions(-) create mode 100644 src/target/llvm/codegen_params.cc diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 0e7cd2b08784..0e9266b17c74 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -226,6 +226,10 @@ constexpr const char* tvm_global_barrier_state = "__tvm_global_barrier_state"; constexpr const char* tvm_prepare_global_barrier = "__tvm_prepare_global_barrier"; /*! \brief Placeholder for the module's entry function. */ constexpr const char* tvm_module_main = "__tvm_main__"; +/*! \brief Prefix for parameter symbols emitted into the main program. */ +constexpr const char* tvm_param_prefix = "__tvm_param__"; +/*! \brief A PackedFunc that looks up linked parameters by storage_id. 
*/ +constexpr const char* tvm_lookup_linked_param = "__lookup_linked_param"; } // namespace symbol // implementations of inline functions. diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 2d0398e20486..463eae51d7b8 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=invalid-name """Helper utility to save parameter dicts.""" +import json import tvm import tvm._ffi @@ -76,3 +77,19 @@ def load_param_dict(param_bytes): param_bytes = bytearray(param_bytes) load_arr = _load_param_dict(param_bytes) return {v.name: v.array for v in load_arr} + + +def linkable_param_dict(graph_json, params, target): + graph = json.loads(graph_json) + data_by_sid = [None] * len(params) + for param_name, param in params.items(): + for node in graph['nodes']: + if node['name'] == param_name: + sid = node['storage_id'] + data_by_sid[sid] = param + + # GraphRuntimeCodegen is expected to allocated the first len(params) storage_ids to contain + # parameters. + assert all(d is not None for d in data_by_sid) + + data_ diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index ddea5456585b..152bbde2ee46 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -443,6 +443,35 @@ class RelayBuildModule : public runtime::ModuleNode { auto lowered_funcs = graph_codegen_->GetIRModule(); + Target target_host = GetTargetHost(); + // If no target_host has been set, we choose a default one, which is + // llvm if "codegen.LLVMModuleCreate" is accessible. + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); + if (!target_host.defined()) + target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + + if (target_host->GetAttr("link-params").value_or(Bool(false))) { + CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; + auto param_ids = graph_codegen_->GetParamIds(); + auto link_params = Map(); + for (auto param : ret_.params) { + link_params.Set( + param.first, tir::LinkedParam(param_ids[param.first], param.second)); + } + + Map dict; + dict.Set(tvm::tir::attr::kLinkedParams, link_params); + dict.Set(tvm::attr::kGlobalSymbol, String(::tvm::target::packed_func::kLookupLinkedParam)); + DictAttrs attrs{dict}; + auto prim = tir::PrimFunc( + Array(), tir::SeqStmt(Array()), VoidType(), Map(), attrs); + if (lowered_funcs.find(target_host->str()) == lowered_funcs.end()) { + lowered_funcs.Set(target_host->str(), IRModule(Map({}))); + } + lowered_funcs[target_host->str()]->Add( + GlobalVar(::tvm::target::packed_func::kLookupLinkedParam), prim); + } + // When there is no lowered_funcs due to reasons such as optimization. 
if (lowered_funcs.size() == 0) { Target target_host = GetTargetHost(); diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 45a36900b586..fdf4ee8baf8a 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -72,6 +72,7 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu this->Load(&reader); module_ = module; ctxs_ = ctxs; + this->SetupLinkedParams(); this->SetupStorage(); this->SetupOpExecs(); for (size_t i = 0; i < input_nodes_.size(); i++) { @@ -244,7 +245,15 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } +void GraphRuntime::PreAllocatedDeleter(void* ctx) { + delete ctx; +} + void GraphRuntime::SetupStorage() { + // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked + // params are present. + tvm::runtime::PackedFunc pf = module_.GetFunction(::tvm::runtime::module::kLookupLinkedParam, true); + // Grab saved optimization plan from graph. std::vector vtype; for (const std::string& s_type : attrs_.dltype) { @@ -254,6 +263,8 @@ void GraphRuntime::SetupStorage() { // Size and device type of each storage pool entry. std::vector pool_entry; // Find the maximum space size. + int node_index = 0; + int node_output = 0; for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. @@ -278,21 +289,41 @@ void GraphRuntime::SetupStorage() { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; } + if (pf != nullptr && pool_entry[sid] == nullptr) { + try { + pool_entry[sid].pre_linked_param = pf(sid); + pool_entry[sid].param_data_entry = i; + } except (std::runtime_error& e) { + // Indicates this storage_id is not pre-linked. + } + } pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; } // Allocate the space. for (const auto& pit : pool_entry) { - std::vector shape; // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. const auto& cit = std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) { return pit.device_type == static_cast(c.device_type); }); TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; - shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + if (pit.pre_linked_param != nullptr) { + auto param_entry = data_entry_[pit.param_data_entry]; + DLTensor* param_tensor = new DLTensor{ + pit.preq_linked_param, ctx, vtype[pit.param_data_entry], + param_entry.size(), nullptr, 0}; + + storage_pool_.push_back( + NDArray::FromDLManagedTensor( + DLManagedTensor{param_tensor, param_tensor, PreAllocatedDeleter})); + + } else { + std::vector shape; + shape.push_back(static_cast(pit.size + 3) / 4); + storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + } } // Assign the pooled entries. 
A unified memory pool is used to simplifiy @@ -303,7 +334,11 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; ICHECK_LT(static_cast(storage_id), storage_pool_.size()); - data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + auto pool_entry = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + if (pool_entry.get() != nullptr) { + data_entry_[i] = pool_entry.get(); + } + const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); } diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index 810ff43fe97a..d687ab4b3615 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -182,7 +182,10 @@ class TVM_DLL GraphRuntime : public ModuleNode { struct PoolEntry { size_t size; int device_type; - PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {} + void* pre_linked_param; + int param_data_entry; + PoolEntry(int s, int dev_type, std::unique_ptr pre_linked_param) : + size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; // Node entry struct NodeEntry { @@ -363,6 +366,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { } ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } + /*! \brief Setup pre-linked parameters. */ + void SetupLinkedParams(); /*! \brief Setup the temporal storage */ void SetupStorage(); /*! \brief Setup the executors. */ diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index faa483d019c0..2604f5c50ddd 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -184,6 +184,108 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) { } } +void CodeGenLLVM::LinkParameters(const Map params) { + // It would be nice to de-dupe these declarations frm src/tir/transforms/make_packed_api.cc, + // but they are at a different layer in the compiler... + std::vector param_types; + // args + param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace())); + // tcodes + param_types.push_back(t_int_->getPointerTo(GetGlobalAddressSpace())); + // num_args + param_types.push_back(t_int64_); + // ret_args + param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace())); + // ret_tcodes + param_types.push_back(t_int_->getPointerTo(GetGlobalAddressSpace())); + // resource_handle + param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace())); + + // TODO(tvm-team): + // Update the function type to respect the ret_type field of f. + // Once we allow more flexibility in the PrimFunc. 
+ llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, param_types, false); + + llvm::Function* function = llvm::Function::Create( + ftype, llvm::Function::ExternalLinkage, + ::tvm::target::packed_func::kLookupLinkedParam, module_.get()); + function->setCallingConv(llvm::CallingConv::C); + function->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); + + llvm::BasicBlock* entry = llvm::BasicBlock::Create(*ctx_, "entry", function); + builder_->SetInsertPoint(entry); + std::vector zero_index_list{{llvm::ConstantInt::get(t_int32_, 0)}}; + auto args_array = builder_->CreateBitCast( + &function->arg_begin()[0], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); + llvm::Value* sid = + builder_->CreateBitCast( + builder_->CreateLoad(t_void_->getPointerTo(GetGlobalAddressSpace()), + builder_->CreateInBoundsGEP(args_array, zero_index_list)), t_int64_); + // +// builder_->CreateGEP(&function->arg_begin()[0], zero_index_list), t_int64_); + + llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function); + llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); + + builder_->SetInsertPoint(default_block); + builder_->CreateRet(ConstInt32(kTvmErrorGeneratedInvalidStorageId)); + + llvm::raw_os_ostream os{std::cout}; + + for (auto kv : params) { + auto array = NDArrayToLLVMArray(ctx_, kv.second->param); + std::cout << "param " << kv.first << ": "; + array->print(os); + std::string symbol_name = std::string{::tvm::runtime::symbol::tvm_param_prefix} + kv.first; + llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable( + *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, + array, symbol_name); + + llvm::BasicBlock* case_block = llvm::BasicBlock::Create(*ctx_, "case_" + symbol_name, function); + switch_inst->addCase( + llvm::cast(llvm::ConstantInt::get(t_int64_, kv.second->id)), + case_block); + builder_->SetInsertPoint(case_block); + auto retval_array = builder_->CreateBitCast( + &function->arg_begin()[3], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); + builder_->CreateStore( +// param_symbol, + builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), + builder_->CreateGEP(retval_array, zero_index_list)); + auto ret_types_array = builder_->CreateBitCast( + &function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + builder_->CreateStore( + llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), + builder_->CreateGEP(ret_types_array, zero_index_list)); + builder_->CreateRet(ConstInt32(0)); + } + + std::cout << "generated function: " << std::endl; + function->print(os); + + // llvm::Value* sid_start = module_->getGlobalVariable(module::tvm_param_array_sid_start); + // llvm::Value* cond = builder_->CreateAnd( + // builder_->CreateICmpSGE(sid, sid_start), + // builder_->CreateICmpSLT(sid, + // module_->getGlobalVariable(module::tvm_param_array_sid_end))); + + // BasicBlock* then_block = BasicBlock::Create(*ctx_, "if_then", function_); + // builder_->CreateCondBr(cond, then_block, else_block); + + // // SID valid block (fetch sid data pointer and write to ret_values). 
+ // builder_->SetInsertPoint(then_block); + // std::vector sid_index_list{builder_->CreateISub(sid, sid_start)}; + // builder_->CreateStore( + // builder_->CreateGEP(module_->getGlobalVariable(module::tvm_param_array), sid_index_list), + // builder_->CreateBitCast( + // builder_->CreateGEP(function->getArg(3), zero_index_list), t_int64_ty_)); + // NOTE: set ret_tcode[0] to kTVMOpaqueHandle because the 'data' pointer of a DLTensor is returned + // here, *not* a proper DLTensor. It is up to the caller to create a DLTensor that correctly + // describes the returned data pointer. + + // SID invalid block (return invalid SID error). +} + std::unique_ptr CodeGenLLVM::Finish() { this->AddStartupFunction(); for (size_t i = 0; i < link_modules_.size(); ++i) { diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc new file mode 100644 index 000000000000..7c160cf198e7 --- /dev/null +++ b/src/target/llvm/codegen_params.cc @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file codegen_blob.cc + */ +#ifdef TVM_LLVM_VERSION + +#include "codegen_params.h" + +namespace tvm { +namespace codegen { + +class DLManagedTensorDeleter { + public: + void operator()(DLManagedTensor* ptr) { + ptr->deleter(ptr); + } +}; + +llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { + llvm::Type* element_type = nullptr; + + auto arr_type = arr.DataType(); + CHECK_EQ(arr_type.lanes(), 1) + << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); + + auto shape = arr.Shape(); + int num_elements = 1; + for (auto shape_elem : shape) { + num_elements *= shape_elem; + } + + std::unique_ptr tensor(arr.ToDLPack()); + std::vector elements; + + switch (arr_type.code()) { + case runtime::DataType::kInt: + CHECK(arr_type.bits() == 8 || + arr_type.bits() == 16 || + arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + + if (arr_type.bits() == 8) { + int8_t* data_buf = static_cast(tensor->dl_tensor.data); + for (int i = 0; i < num_elements; i++) { + std::cout << std::hex << +static_cast(data_buf[i]) << std::dec << " "; + if (((i + 1) % 16) == 0) { + std::cout << std::endl; + } + + elements.emplace_back(llvm::ConstantInt::getSigned(element_type, data_buf[i])); + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int16_t*) tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int32_t*) tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int64_t*) tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kUInt: + CHECK(arr_type.bits() == 8 || + arr_type.bits() == 16 || + arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int8_t*) tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int16_t*) tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int32_t*) tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int64_t*) tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kFloat: + if (arr_type.bits() == 32) { + element_type = llvm::Type::getFloatTy(*ctx); + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantFP::get(element_type, ((float*) tensor->dl_tensor.data)[i])); + } + } else 
if (arr_type.bits() == 64) { + element_type = llvm::Type::getDoubleTy(*ctx); + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantFP::get(element_type, ((double*) tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; + } + break; + + default: + CHECK(false) << "Data type not supported"; + } + + return llvm::cast( + llvm::ConstantArray::get(llvm::ArrayType::get(element_type, num_elements), + llvm::ArrayRef(elements))); +} + +// void LLVMCodeGenParams(llvm::LLVMContext* ctx, +// llvm::Module* module, +// int64_t storage_id_offset, +// const Map& params, +// const std::unordered_map& param_ids) { +// CHECK_EQ(params.size(), params_ids.size()) +// << "Expect param_names and params_ids to have equal lengths, but params.size() == " +// << params.size() << " and params_ids.size() == " << params_ids.size(); + +// llvm::ArrayType* t_sid_ptr_ty = +// llvm::ArrayType::get(llvm::PointerType::getUnqual(llvm::getVoidTy())); +// std::vector sid_ptrs; +// for (auto kv : params) { + +// sid_ptrs.push_back( +// } + +// llvm::GlobalVaraible* sid_offset_symbol = new llvm::GlobalVariable( +// *module, llvm::Type::getInt64Ty(), true, llvm::GlobalVariable::InternalLinkage, +// llvm::ConstantInt::getSigned(ctx, sid_offset), ::tvm::runtime::symbol::tvm_sid_offset); +// llvm::GlobalVariable* sid_ptrs_symbol = new llvm::GlobalVariable( +// *module, t_sid_ptr_ty, true, llvm::GlobalValue::InternalLinkage, +// llvm::ConstantArray::get(t_sid_ptr_ty, sid_ptrs), ::tvm::runtime::symbol::tvm_param_array); +// } + +} // namespace codegen +} // namespace tvm + +#endif // TVM_LLVM_VERSION diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 569082022852..98857f574bd9 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -209,7 +209,8 @@ class LLVMModuleNode final : public runtime::ModuleNode { } funcs.push_back(f); } - ICHECK_NE(funcs.size(), 0U); + bool is_link_params = target->GetAttr("link-params").value_or(Bool(false)); + ICHECK(funcs.size() > 0 || is_link_params); // TODO(tqchen): remove the entry function behavior as it does not // makes sense when we start to use multiple modules. cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); @@ -222,6 +223,10 @@ class LLVMModuleNode final : public runtime::ModuleNode { cg->AddMainFunction(entry_func); } + if (is_link_params) { + CHECK(found_linked_params) << "--link-params given, but no parameters given to codegen"; + cg->LinkParameters(linked_params); + } module_ = cg->Finish(); module_->addModuleFlag(llvm::Module::Warning, "tvm_target", llvm::MDString::get(*ctx_, LLVMTargetToString(target))); diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 3599493a74cb..ea2a1f165b30 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -14,6 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import collections +import ctypes +import json import tvm import tvm.testing from tvm import te From 8d62592dabbcedead74a4569c9db65f8142a15bf Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 9 Nov 2020 18:00:16 -0800 Subject: [PATCH 04/60] llvm and c backends work! 
--- include/tvm/runtime/module.h | 2 +- src/relay/backend/build_module.cc | 4 +- src/runtime/graph/graph_runtime.cc | 42 ++-- src/runtime/graph/graph_runtime.h | 8 +- src/target/llvm/codegen_llvm.cc | 4 +- src/target/llvm/codegen_params.cc | 224 +++++++++++++++--- src/target/llvm/codegen_params.h | 48 ++++ src/target/llvm/llvm_module.cc | 16 +- src/target/source/codegen_c_host.cc | 62 +++++ src/target/source/codegen_c_host.h | 3 + .../unittest/test_target_codegen_llvm.py | 2 +- 11 files changed, 350 insertions(+), 65 deletions(-) create mode 100644 src/target/llvm/codegen_params.h diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 0e9266b17c74..04a5cf8bf25d 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -229,7 +229,7 @@ constexpr const char* tvm_module_main = "__tvm_main__"; /*! \brief Prefix for parameter symbols emitted into the main program. */ constexpr const char* tvm_param_prefix = "__tvm_param__"; /*! \brief A PackedFunc that looks up linked parameters by storage_id. */ -constexpr const char* tvm_lookup_linked_param = "__lookup_linked_param"; +constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; } // namespace symbol // implementations of inline functions. diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 152bbde2ee46..cc304808b16f 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -461,7 +461,7 @@ class RelayBuildModule : public runtime::ModuleNode { Map dict; dict.Set(tvm::tir::attr::kLinkedParams, link_params); - dict.Set(tvm::attr::kGlobalSymbol, String(::tvm::target::packed_func::kLookupLinkedParam)); + dict.Set(tvm::attr::kGlobalSymbol, String(::tvm::runtime::symbol::tvm_lookup_linked_param)); DictAttrs attrs{dict}; auto prim = tir::PrimFunc( Array(), tir::SeqStmt(Array()), VoidType(), Map(), attrs); @@ -469,7 +469,7 @@ class RelayBuildModule : public runtime::ModuleNode { lowered_funcs.Set(target_host->str(), IRModule(Map({}))); } lowered_funcs[target_host->str()]->Add( - GlobalVar(::tvm::target::packed_func::kLookupLinkedParam), prim); + GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), prim); } // When there is no lowered_funcs due to reasons such as optimization. diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index fdf4ee8baf8a..c64f773f5157 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -72,7 +72,6 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu this->Load(&reader); module_ = module; ctxs_ = ctxs; - this->SetupLinkedParams(); this->SetupStorage(); this->SetupOpExecs(); for (size_t i = 0; i < input_nodes_.size(); i++) { @@ -245,14 +244,16 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } -void GraphRuntime::PreAllocatedDeleter(void* ctx) { - delete ctx; +void GraphRuntime::PreAllocatedDLTensorDeleter(DLManagedTensor* tensor) { + // ctx is the DLTensor which needs to get deleted. The data member points to global const memory. + delete reinterpret_cast(tensor); } void GraphRuntime::SetupStorage() { // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. 
- tvm::runtime::PackedFunc pf = module_.GetFunction(::tvm::runtime::module::kLookupLinkedParam, true); + tvm::runtime::PackedFunc pf = module_.GetFunction( + ::tvm::runtime::symbol::tvm_lookup_linked_param, true); // Grab saved optimization plan from graph. std::vector vtype; @@ -263,8 +264,6 @@ void GraphRuntime::SetupStorage() { // Size and device type of each storage pool entry. std::vector pool_entry; // Find the maximum space size. - int node_index = 0; - int node_output = 0; for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. @@ -289,14 +288,14 @@ void GraphRuntime::SetupStorage() { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; } - if (pf != nullptr && pool_entry[sid] == nullptr) { + if (pf != nullptr && pool_entry[sid].pre_linked_param == nullptr) { try { pool_entry[sid].pre_linked_param = pf(sid); - pool_entry[sid].param_data_entry = i; - } except (std::runtime_error& e) { + } catch (std::runtime_error& e) { // Indicates this storage_id is not pre-linked. } } + pool_entry[sid].param_data_entry = i; pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; } @@ -310,16 +309,20 @@ void GraphRuntime::SetupStorage() { }); TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; if (pit.pre_linked_param != nullptr) { - auto param_entry = data_entry_[pit.param_data_entry]; - DLTensor* param_tensor = new DLTensor{ - pit.preq_linked_param, ctx, vtype[pit.param_data_entry], - param_entry.size(), nullptr, 0}; - - storage_pool_.push_back( - NDArray::FromDLManagedTensor( - DLManagedTensor{param_tensor, param_tensor, PreAllocatedDeleter})); + LOG(INFO) << "param " << pit.param_data_entry << " pre-loaded!"; + auto param_shape = &attrs_.shape[pit.param_data_entry]; + DLManagedTensor* param_tensor = new DLManagedTensor{ + {pit.pre_linked_param, ctx, static_cast(param_shape->size()), + vtype[pit.param_data_entry], param_shape->data(), nullptr, 0}, + nullptr, + PreAllocatedDLTensorDeleter}; + + storage_pool_.push_back(NDArray::FromDLPack(param_tensor)); + LOG(INFO) << "Loaded data entry " << pit.param_data_entry + << " from pre-linked blob: " << param_tensor->dl_tensor.data; } else { + LOG(INFO) << "param " << pit.param_data_entry << " blank!"; std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); @@ -334,10 +337,7 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; ICHECK_LT(static_cast(storage_id), storage_pool_.size()); - auto pool_entry = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); - if (pool_entry.get() != nullptr) { - data_entry_[i] = pool_entry.get(); - } + data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index d687ab4b3615..9f0b0962333a 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -184,8 +184,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { int device_type; void* pre_linked_param; int param_data_entry; - PoolEntry(int s, int dev_type, std::unique_ptr pre_linked_param) : - size(s), 
device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} +// PoolEntry(int s, int dev_type, void* pre_linked_param) : +// size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; // Node entry struct NodeEntry { @@ -366,8 +366,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { } ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } - /*! \brief Setup pre-linked parameters. */ - void SetupLinkedParams(); + /*! \brief Delete pre-allocated DLTensor. */ + static void PreAllocatedDLTensorDeleter(DLManagedTensor* tensor); /*! \brief Setup the temporal storage */ void SetupStorage(); /*! \brief Setup the executors. */ diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 2604f5c50ddd..20cbdf83b971 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -208,13 +208,13 @@ void CodeGenLLVM::LinkParameters(const Map params) { llvm::Function* function = llvm::Function::Create( ftype, llvm::Function::ExternalLinkage, - ::tvm::target::packed_func::kLookupLinkedParam, module_.get()); + ::tvm::runtime::symbol::tvm_lookup_linked_param, module_.get()); function->setCallingConv(llvm::CallingConv::C); function->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); llvm::BasicBlock* entry = llvm::BasicBlock::Create(*ctx_, "entry", function); builder_->SetInsertPoint(entry); - std::vector zero_index_list{{llvm::ConstantInt::get(t_int32_, 0)}}; + std::vector zero_index_list{llvm::ConstantInt::get(t_int32_, 0)}; auto args_array = builder_->CreateBitCast( &function->arg_begin()[0], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); llvm::Value* sid = diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 7c160cf198e7..365ab04505c9 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -24,6 +24,8 @@ #include "codegen_params.h" +#include + namespace tvm { namespace codegen { @@ -63,11 +65,6 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: if (arr_type.bits() == 8) { int8_t* data_buf = static_cast(tensor->dl_tensor.data); for (int i = 0; i < num_elements; i++) { - std::cout << std::hex << +static_cast(data_buf[i]) << std::dec << " "; - if (((i + 1) % 16) == 0) { - std::cout << std::endl; - } - elements.emplace_back(llvm::ConstantInt::getSigned(element_type, data_buf[i])); } } else if (arr_type.bits() == 16) { @@ -152,31 +149,198 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: llvm::ArrayRef(elements))); } -// void LLVMCodeGenParams(llvm::LLVMContext* ctx, -// llvm::Module* module, -// int64_t storage_id_offset, -// const Map& params, -// const std::unordered_map& param_ids) { -// CHECK_EQ(params.size(), params_ids.size()) -// << "Expect param_names and params_ids to have equal lengths, but params.size() == " -// << params.size() << " and params_ids.size() == " << params_ids.size(); - -// llvm::ArrayType* t_sid_ptr_ty = -// llvm::ArrayType::get(llvm::PointerType::getUnqual(llvm::getVoidTy())); -// std::vector sid_ptrs; -// for (auto kv : params) { - -// sid_ptrs.push_back( -// } - -// llvm::GlobalVaraible* sid_offset_symbol = new llvm::GlobalVariable( -// *module, llvm::Type::getInt64Ty(), true, llvm::GlobalVariable::InternalLinkage, -// llvm::ConstantInt::getSigned(ctx, sid_offset), ::tvm::runtime::symbol::tvm_sid_offset); -// llvm::GlobalVariable* sid_ptrs_symbol = new llvm::GlobalVariable( -// *module, 
t_sid_ptr_ty, true, llvm::GlobalValue::InternalLinkage, -// llvm::ConstantArray::get(t_sid_ptr_ty, sid_ptrs), ::tvm::runtime::symbol::tvm_param_array); -// } + +static constexpr const char* kFloatCast = "(float)"; +static constexpr const char* kDoubleCast = "(double)"; + +static constexpr const int kMaxLineLength = 80; + + +void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { + auto arr_type = arr.DataType(); + CHECK_EQ(arr_type.lanes(), 1) + << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); + + int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); + if (arr_type.code() == runtime::DataType::TypeCode::kInt) { + one_element_size_bytes += 1; // sign bit + if (arr_type.bits() > 32) { + one_element_size_bytes += 2; // "UL" + } + } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { + if (arr_type.bits() > 32) { + one_element_size_bytes += 1; // "L" + } + } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { + // Floats and doubles are printed as hex but casted. + one_element_size_bytes += std::string{(arr_type.bits() == 32 ? kFloatCast : kDoubleCast)}.size(); + } + + int elements_per_row = 16; + while (elements_per_row > 1 && + (elements_per_row * one_element_size_bytes) > (kMaxLineLength - indent_chars)) { + elements_per_row /= 2; + } + + std::string indent_str(indent_chars, ' '); + os << indent_str; + + auto shape = arr.Shape(); + int num_elements = 1; + for (auto shape_elem : shape) { + num_elements *= shape_elem; + } + + std::unique_ptr tensor(arr.ToDLPack()); + auto old_fmtflags = os.flags(); + os.setf(std::ios::right | std::ios::hex, std::ios::adjustfield | std::ios::basefield); + os.fill('0'); + switch (arr_type.code()) { + case runtime::DataType::kInt: + CHECK(arr_type.bits() == 8 || + arr_type.bits() == 16 || + arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid printing + // as a char. 
+ int8_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint8_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(2) << +static_cast(to_print); + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + int16_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint16_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(4) << to_print; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + int32_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint32_t to_print ; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(8) << to_print; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + int64_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint64_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(16) << to_print; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kUInt: + CHECK(arr_type.bits() == 8 || + arr_type.bits() == 16 || + arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid printing + // as a char. 
+ os << "0x" << std::setw(2) + << +static_cast(static_cast(tensor->dl_tensor.data)[i]); + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kFloat: + if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + os << kFloatCast << "0x" << std::setw(8) + << static_cast(tensor->dl_tensor.data)[i] << "U"; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + os << kDoubleCast << "0x" << std::setw(16) + << static_cast(tensor->dl_tensor.data)[i] << "UL"; + if (i < num_elements - 1) { os << ", "; } + if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } + } else { + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; + } + break; + + default: + CHECK(false) << "Data type not supported"; + } + + if (num_elements % elements_per_row != 0) { + os << "\n"; + } + os.flags(old_fmtflags); +} } // namespace codegen } // namespace tvm diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h new file mode 100644 index 000000000000..8b8ba4f23cc6 --- /dev/null +++ b/src/target/llvm/codegen_params.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file codegen_params.h + */ + +#ifndef TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ +#define TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ + +#include "llvm_common.h" +#include +#include + +namespace tvm { +namespace codegen { + +llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr); + +void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); + +void LLVMCodeGenParams(llvm::LLVMContext* ctx, + llvm::Module* module, + int64_t storage_id_offset, + ::tvm::runtime::Array param_names, + ::tvm::runtime::Array params_by_sid); + + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 98857f574bd9..89774ec61618 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -200,6 +200,16 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::vector funcs; std::string entry_func; for (auto kv : mod->functions) { + if (could_have_linked_params && + kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { + Map attrs_dict = Downcast>(kv.second->attrs->dict); + CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = Downcast>( + attrs_dict[::tvm::tir::attr::kLinkedParams]); + found_linked_params = true; + continue; + } ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { @@ -209,8 +219,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { } funcs.push_back(f); } - bool is_link_params = target->GetAttr("link-params").value_or(Bool(false)); - ICHECK(funcs.size() > 0 || is_link_params); + ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params)); // TODO(tqchen): remove the entry function behavior as it does not // makes sense when we start to use multiple modules. 
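+  // NOTE: when --link-params is given, the special tvm_lookup_linked_param function is pulled
+  // out of the IRModule by the loop above and its parameter table is handed to the codegen via
+  // LinkParameters() below, rather than being lowered like an ordinary PrimFunc.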
cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); @@ -223,8 +232,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { cg->AddMainFunction(entry_func); } - if (is_link_params) { - CHECK(found_linked_params) << "--link-params given, but no parameters given to codegen"; + if (found_linked_params) { cg->LinkParameters(linked_params); } module_ = cg->Finish(); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 6ae11f4f9af8..3896e37d5b5d 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -23,6 +23,8 @@ #include "codegen_c_host.h" #include +#include +#include #include #include @@ -31,6 +33,7 @@ #include "../../support/str_escape.h" #include "../build_common.h" #include "../func_registry_generator.h" +#include "../llvm/codegen_params.h" namespace tvm { namespace codegen { @@ -57,6 +60,46 @@ void CodeGenCHost::AddFunction(const PrimFunc& f) { CodeGenC::AddFunction(f); } +void CodeGenCHost::LinkParameters(Map params) { + PrintFuncPrefix(); + stream << " " << tvm::runtime::symbol::tvm_lookup_linked_param + << "(void* args, int* arg_type_ids, int num_args, void* out_ret_value, " + << "int* out_ret_tcode, void* resource_handle) {\n"; + ICHECK_EQ(GetUniqueName(tvm::runtime::symbol::tvm_lookup_linked_param), + tvm::runtime::symbol::tvm_lookup_linked_param) + << "builtin PackedFunc name already taken: " + << tvm::runtime::symbol::tvm_lookup_linked_param; + stream << " switch (((int64_t*) args)[0]) {\n" + << " default:\n" + << " return " << kTvmErrorGeneratedInvalidStorageId << ";\n"; + + function_names_.emplace_back(tvm::runtime::symbol::tvm_lookup_linked_param); + for (auto kv : params) { + decl_stream << "#ifdef __cplusplus\n" + << "extern \"C\" {\n" + << "#endif\n" + << "static const "; + int64_t num_elements = 1; + for (int64_t dim : kv.second->param.Shape()) { + num_elements *= dim; + } + PrintType(kv.second->param.DataType(), decl_stream); + decl_stream << " " << ::tvm::runtime::symbol::tvm_param_prefix + << kv.first << "[" << num_elements << "] = {\n"; + NDArrayDataToC(kv.second->param, 4, decl_stream); + decl_stream << "};\n" + << "#ifdef __cplusplus\n" + << "} // extern \"C\"\n" + << "#endif\n"; + stream << " case " << kv.second->id << ":\n" + << " ((int64_t*)out_ret_value)[0] = (int64_t) " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" + << " out_ret_tcode[0] = " << kTVMOpaqueHandle << ";\n" + << " return 0;\n"; + } + stream << " }\n" + << "}\n"; +} + void CodeGenCHost::PrintFuncPrefix() { // NOLINT(*) stream << "#ifdef __cplusplus\n" << "extern \"C\"\n" @@ -307,12 +350,31 @@ runtime::Module BuildCHost(IRModule mod, Target target) { CodeGenCHost cg; cg.Init(output_ssa, emit_asserts, target->str()); + Map linked_params; + bool found_linked_params = false; + bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { + if (could_have_linked_params && + kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { + Map attrs_dict = Downcast>(kv.second->attrs->dict); + CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = Downcast>( + attrs_dict[::tvm::tir::attr::kLinkedParams]); + found_linked_params = true; + continue; + } + ICHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); cg.AddFunction(f); } + if 
(could_have_linked_params) {
+    ICHECK(found_linked_params) << "--link-params given but none found";
+    cg.LinkParameters(linked_params);
+  }
+
   if (target->GetAttr<Bool>("system-lib").value_or(Bool(false))) {
     ICHECK_EQ(target->GetAttr<String>("runtime").value_or(""), "c")
         << "c target only supports generating C runtime SystemLibs";
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index 1bf378be1422..b54b6fbfcfeb 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -42,6 +42,9 @@ class CodeGenCHost final : public CodeGenC {
 
   void AddFunction(const PrimFunc& f);
 
+  /*! \brief Add linked parameters, if they are present. */
+  void LinkParameters(Map params);
+
   void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
   void PrintFuncPrefix() final;                        // NOLINT(*)
   void PrintFinalReturn() final;                       // NOLINT(*)
diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index ea2a1f165b30..162481bfdb6e 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -21,7 +21,7 @@
 import tvm.testing
 from tvm import te
 from tvm import topi
-from tvm.contrib import utils, clang
+from tvm.contrib import utils
 import numpy as np
 import ctypes
 import math

From e0259b0762a444891089f96ed303bb29043e81d5 Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Mon, 9 Nov 2020 22:35:39 -0800
Subject: [PATCH 05/60] switch to floating point hex

---
 src/target/llvm/codegen_params.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc
index 365ab04505c9..e4ecf30d382f 100644
--- a/src/target/llvm/codegen_params.cc
+++ b/src/target/llvm/codegen_params.cc
@@ -173,7 +173,7 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream&
     }
   } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) {
     // Floats and doubles are printed as hex but casted.
-    one_element_size_bytes += std::string{(arr_type.bits() == 32 ?
kFloatCast : kDoubleCast)}.size(); + one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; } int elements_per_row = 16; @@ -193,7 +193,8 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& std::unique_ptr tensor(arr.ToDLPack()); auto old_fmtflags = os.flags(); - os.setf(std::ios::right | std::ios::hex, std::ios::adjustfield | std::ios::basefield); + os.setf(std::ios::right | std::ios::hex | std::ios::fixed | std::ios::scientific, + std::ios::adjustfield | std::ios::basefield | std::ios::floatfield); os.fill('0'); switch (arr_type.code()) { case runtime::DataType::kInt: @@ -314,15 +315,14 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& case runtime::DataType::TypeCode::kFloat: if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { - os << kFloatCast << "0x" << std::setw(8) - << static_cast(tensor->dl_tensor.data)[i] << "U"; + os << static_cast(tensor->dl_tensor.data)[i]; if (i < num_elements - 1) { os << ", "; } if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } } + std::cout << "\n"; } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { - os << kDoubleCast << "0x" << std::setw(16) - << static_cast(tensor->dl_tensor.data)[i] << "UL"; + os << static_cast(tensor->dl_tensor.data)[i]; if (i < num_elements - 1) { os << ", "; } if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } } From cb7c001d61be324ee9a6656a7b46edefdc640d5b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 9 Nov 2020 23:38:46 -0800 Subject: [PATCH 06/60] c backend works works --- src/target/llvm/codegen_params.cc | 45 ++- tests/python/unittest/test_link_params.py | 337 ++++++++++++++++++++++ 2 files changed, 374 insertions(+), 8 deletions(-) create mode 100644 tests/python/unittest/test_link_params.py diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index e4ecf30d382f..20a1efbb575e 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -173,7 +173,8 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& } } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { // Floats and doubles are printed as hex but casted. - one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; + one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + + 1 /* exponent sign */ + 1 /* extra decimal digit in exponent */; } int elements_per_row = 16; @@ -193,8 +194,8 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& std::unique_ptr tensor(arr.ToDLPack()); auto old_fmtflags = os.flags(); - os.setf(std::ios::right | std::ios::hex | std::ios::fixed | std::ios::scientific, - std::ios::adjustfield | std::ios::basefield | std::ios::floatfield); + os.setf(std::ios::internal | std::ios::hex, + std::ios::adjustfield | std::ios::basefield | std::ios::showbase); os.fill('0'); switch (arr_type.code()) { case runtime::DataType::kInt: @@ -210,7 +211,7 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid printing // as a char. 
int8_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint8_t to_print; + uint16_t to_print; if (elem < 0) { os << "-"; to_print = -elem; @@ -240,7 +241,7 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& } else if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { int32_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint32_t to_print ; + uint32_t to_print; if (elem < 0) { os << "-"; to_print = -elem; @@ -312,17 +313,44 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& } break; - case runtime::DataType::TypeCode::kFloat: + case runtime::DataType::TypeCode::kFloat: { + std::stringstream ss; + ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + os.fill(' '); + os.setf(std::ios::left, std::ios::adjustfield); if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { - os << static_cast(tensor->dl_tensor.data)[i]; + float elem = static_cast(tensor->dl_tensor.data)[i]; + if (isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (isnan(elem)) { + // GNU extension, implemenatation-dependent. + os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } if (i < num_elements - 1) { os << ", "; } if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } } std::cout << "\n"; } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { - os << static_cast(tensor->dl_tensor.data)[i]; + double elem = static_cast(tensor->dl_tensor.data)[i]; + if (isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (isnan(elem)) { + // GNU extension, implemenatation-dependent. + os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } if (i < num_elements - 1) { os << ", "; } if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } } @@ -331,6 +359,7 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& << arr_type.bits() << "-bit array"; } break; + } default: CHECK(false) << "Data type not supported"; diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py new file mode 100644 index 000000000000..a3ce97e383d8 --- /dev/null +++ b/tests/python/unittest/test_link_params.py @@ -0,0 +1,337 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
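+"""Tests that model parameters can be linked directly into the generated library.
+
+Each linkable dtype is built once with --link-params and once without; both builds are run
+on the same random input and their outputs compared, and the linked parameter data is read
+back out of the built artifact and checked against the original arrays.
+"""
+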
+import collections +import ctypes +import json +import os +import re +import struct +import sys + +import numpy as np +import pytest + +import tvm +import tvm.relay +import tvm.testing +from tvm.contrib import utils + + +TEST_SHAPE = (3, 4, 5) + + +# The data types that are linkable. +LINKABLE_DTYPES = ( + [f'uint{b}' for b in (8, 16, 32, 64)] + + [f'int{b}' for b in (8, 16, 32, 64)] + + ['float32', 'float64']) + + + +def dtype_info(dtype): + """Lookup numpy type info for the given string dtype (of LINKABLE_DTYPES above).""" + if 'int' in dtype: + return np.iinfo(getattr(np, dtype)) + else: + return np.finfo(getattr(np, dtype)) + + +# Note: for debugging, set this to an integer (i.e. 1.0). Then all "random" tensors will become +# predictable +RANDOM_TENSOR_START = None + + +def _make_random_tensor(dtype): + """Create a random test tensor of shape TEST_SHAPE and the given dtype.""" + global RAND_SEED + if RANDOM_TENSOR_START is not None: + to_return = np.arange(RANDOM_TENSOR_START, + RANDOM_TENSOR_START + np.prod(TEST_SHAPE), + dtype=dtype).reshape(TEST_SHAPE) + RAND_SEED += np.prod(TEST_SHAPE) + return to_return + + dinfo = dtype_info(dtype) + if 'int' in dtype: + return np.random.randint(dinfo.min, dinfo.max, TEST_SHAPE, dtype=dtype) + else: + to_return = np.random.uniform(0, dinfo.max, TEST_SHAPE) +# to_return = dinfo.min + (np.random.random(TEST_SHAPE) * dinfo.max) + np.reshape(to_return, np.prod(TEST_SHAPE))[::2] *= -1 + return to_return + + +def _lookup_sid(graph, name): + """Lookup the storage id of a named parameter. + + Arguments + --------- + graph : dict + Parsed JSON graph. + + name : str + Name of the tensor parameter to lookup. + + Returns + ------- + int : + The storage_id of the parameter. + """ + num_outputs_seen = 0 + for i, n in enumerate(graph['nodes']): + if n['name'] == name: + return graph['attrs']['storage_id'][1][num_outputs_seen] + else: + if 'attrs' in n and 'num_outputs' in n['attrs']: + num_outputs_seen += n['attrs']['num_outputs'] + else: + num_outputs_seen += 1 + + raise KeyError(f'no such param: {name}') + + +def _get_ctypes_dtype(dt): + """Return a ctypes c_* datatype given a string data type.""" + if 'int' in dt: + return getattr(ctypes, f'c_{dt}') + elif dt == 'float32': + return ctypes.c_float + elif dt == 'float64': + return ctypes.c_double + else: + assert False, f'unknown dtype: {dt}' + + +def _verify_linked_param(dtype, lib, mod, graph, name): + """Directly read memory from the linked library to verify the linked parameter is correct.""" + sid = _lookup_sid(graph, name) + # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend), + # a GraphRuntimeFactory module is created instead of the module itself. 
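+    # The generated _lookup_linked_param PackedFunc maps a storage_id to the raw address of
+    # the parameter baked into the library; that memory is reinterpreted through ctypes below
+    # and compared against the expected NDArray.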
+ param_ptr = mod.get_function("_lookup_linked_param", True)(sid) + print('verify', param_ptr) + arr_data = (_get_ctypes_dtype(dtype) * np.prod(TEST_SHAPE)).from_address(param_ptr.value) + gen_param = lib.params[name] + print('gen param dtype', gen_param.dtype) + arr = np.ndarray( + shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order='C') + if 'int' in gen_param.dtype: + np.testing.assert_equal(gen_param.asnumpy(), arr) + else: + np.testing.assert_allclose(gen_param.asnumpy(), arr) + + +def _make_mod_and_params(dtype): + """Create a Relay module and parameters to test the given datatype.""" + param_decls = collections.OrderedDict() + param_init = {} + + def _add_decl(name, dtype): + param_decls[name] = f'%{name} : Tensor[{TEST_SHAPE}, {dtype}]' + param_init[name] = _make_random_tensor(dtype) + + _add_decl(f'{dtype}_a', dtype) + _add_decl(f'{dtype}_b', dtype) + + mod_lines = [ + '#[version = "0.0.5"]', + f"def @main(%rand_input : Tensor[{TEST_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", + ] + if 'int' in dtype: + mod_lines.append( +# f' %0 = bitwise_xor(%rand_input, bitwise_xor(%{dtype}_a, %{dtype}_b));') + f' %0 = add(%rand_input, %{dtype}_a);') + else: + mod_lines.append( + f' %0 = cast(add(%rand_input, cast(add(%{dtype}_a, %{dtype}_b), dtype="{dtype}")), dtype="{dtype}");') +# f' %0 = cast(add(%rand_input, %{dtype}_a), dtype="{dtype}");') + mod_lines.extend([ + ' %0', + '}' + ]) + + mod = tvm.parser.fromtext('\n'.join(mod_lines)) + return mod, param_init + + +@tvm.testing.requires_llvm +def test_llvm_link_params(): + for dtype in LINKABLE_DTYPES: + mod, param_init = _make_mod_and_params(dtype) + rand_input = _make_random_tensor(dtype) + main_func = mod['main'] + target = 'llvm --runtime=c --system-lib --link-params' + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, target, params=param_init) + assert set(lib.params.keys()) == {"p0"} # NOTE: op folded + + graph = json.loads(lib.graph_json) + for p in lib.params: + _verify_linked_param(dtype, lib, lib.lib, graph, p) + + # Wrap in function to explicitly deallocate the runtime. + def _run_linked(lib): + graph_json, mod, _ = lib + graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt.set_input('rand_input', rand_input) # NOTE: params not required. 
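+            # The linked build carries its parameters inside the library itself, so only the
+            # non-parameter input needs to be provided before running.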
+ graph_rt.run() + return graph_rt.get_output(0) + + linked_output = _run_linked(lib) + + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, 'llvm --system-lib', params=param_init) + + def _run_unlinked(lib): + graph_json, mod, lowered_params = lib + graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt.set_input('rand_input', rand_input, **lowered_params) + graph_rt.run() + return graph_rt.get_output(0) + + unlinked_output = _run_unlinked(lib) + + if 'int' in dtype: + np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) + else: + np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) + + +def _get_c_datatype(dtype): + """Translate LINKABLE_DTYPES element to c datatype.""" + if 'int' in dtype: + return f'{dtype}_t' + elif dtype == 'float32': + return 'float' + elif dtype == 'float64': + return 'double' + else: + assert False, f'unknown dtype {dtype}' + + +def _format_c_value(dtype, width, x): + if 'int' in dtype: + hex_formatstr = f'{{:{"+" if dtype.startswith("int") else ""}#0{width}x}}' + return hex_formatstr.format(x) + elif 'float' in dtype: + to_ret = float(x).hex() + if 'inf' in to_ret: + return ('-' if x < 0 else '') + 'INFINITY' + elif 'nan' in to_ret: + return 'NAN' + + before, after = to_ret.split('p') + return f'{before.rstrip("0")}p{after}' + else: + assert False, f"don't know dtype {dtype}" + + +HEX_NUM_RE = re.compile(r'[+\-]?(?:(?:0x[0-9A-Fa-f.p+-]+)|(?:INFINITY)|(?:NAN))') + + +def test_c_link_params(): + temp_dir = utils.tempdir() + for dtype in LINKABLE_DTYPES: + print("test", dtype) + mod, param_init = _make_mod_and_params(dtype) + print('built mod', mod) + rand_input = _make_random_tensor(dtype) + main_func = mod['main'] + target = 'c --link-params' + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lib = tvm.relay.build(mod, target, params=param_init) + assert set(lib.params.keys()) == {"p0"} # NOTE: op folded + + src = lib.lib.get_source() + lib.lib.save('test.c', 'cc') + c_dtype = _get_c_datatype(dtype) + src_lines = src.split('\n') + param = lib.params['p0'].asnumpy().reshape(np.prod(TEST_SHAPE)) + param_def = f'static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{' + for i, line in enumerate(src_lines): + if line == param_def: + i += 1 + break + else: + assert False, f'did not find parameter definition "{param_def}":\n{src}' + + cursor = 0 + width = dtype_info(dtype).bits // 4 + 2 + if dtype.startswith("int"): + width += 1 # Account for sign + + print('check printing of', param) + while '};' not in src_lines[i]: + for match in HEX_NUM_RE.finditer(src_lines[i]): + assert match.group() == _format_c_value(dtype, width, param[cursor]), ( + f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got ' + f'"{match.group(0)}"; full p0 follows:\n{src}') + cursor += 1 + i += 1 + + assert cursor == np.prod(param.shape) + temp = utils.tempdir() + + # Need a unique name per library to avoid dlopen caching the lib load. + lib_path = temp_dir.relpath(f'test-{dtype}-linked.so') + lib['remove_params']().export_library(lib_path) + lib_mod = tvm.runtime.load_module(lib_path) + +# lib_mod = lib_factory['default']() + graph = json.loads(lib.graph_json) + for p in lib.params: + _verify_linked_param(dtype, lib, lib_mod, graph, p) + + # Wrap in function to explicitly deallocate the runtime. 
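+        # (graph_rt only lives inside the helper, so its reference to the loaded library is
+        # dropped as soon as the helper returns.)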
+ def _run_linked(lib_mod): + graph_rt = tvm.contrib.graph_runtime.GraphModule( + lib_mod['default'](tvm.cpu(0))) + graph_rt.set_input('rand_input', rand_input) # NOTE: params not required. + print('linked', graph_rt.get_input('p0')) + graph_rt.run() + + return graph_rt.get_output(0) + + linked_output = _run_linked(lib_mod) + + linked_params = lib.params + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lib = tvm.relay.build(mod, 'c', params=param_init) + _, _, params = lib + # Need a unique name per library to avoid dlopen caching the lib load. + lib_path = temp_dir.relpath(f'test-{dtype}-unlinked.so') + lib.export_library(lib_path) + lib_mod = tvm.runtime.load_module(lib_path) + + print('unlinked', params) + def _run_unlinked(lib_mod): + graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod['default'](tvm.cpu(0))) + graph_rt.set_input('rand_input', rand_input, **params) + graph_rt.run() + return graph_rt.get_output(0) + + unlinked_output = _run_unlinked(lib_mod) + + if 'int' in dtype: + np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) + else: + np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) + + + + +if __name__ == '__main__': + sys.exit(pytest.main(sys.argv[1:])) From bbdfd3d71dbdf75f9bfff62982f576ceeb72dd68 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 10 Nov 2020 19:53:49 -0800 Subject: [PATCH 07/60] crt tests work --- cmake/modules/StandaloneCrt.cmake | 1 + include/tvm/runtime/crt/error_codes.h | 16 + include/tvm/runtime/crt/graph_runtime.h | 12 + .../tvm/runtime/crt/graph_runtime_module.h | 42 +++ include/tvm/runtime/crt/module.h | 8 + python/tvm/contrib/binutils.py | 277 +----------------- python/tvm/micro/build.py | 18 +- python/tvm/micro/debugger.py | 32 +- python/tvm/micro/transport/__init__.py | 1 + python/tvm/micro/transport/base.py | 2 +- python/tvm/micro/transport/file_descriptor.py | 9 + src/runtime/crt/Makefile | 2 +- src/runtime/crt/common/crt_runtime_api.c | 2 +- src/runtime/crt/graph_runtime/graph_runtime.c | 21 +- .../graph_runtime_module.c | 211 +++++++++++++ src/runtime/crt/host/main.cc | 9 + .../internal/graph_runtime/graph_runtime.h | 1 + src/runtime/micro/micro_session.cc | 2 + src/runtime/rpc/rpc_endpoint.cc | 1 + tests/python/unittest/test_link_params.py | 64 +++- 20 files changed, 443 insertions(+), 288 deletions(-) create mode 100644 include/tvm/runtime/crt/graph_runtime_module.h create mode 100644 src/runtime/crt/graph_runtime_module/graph_runtime_module.c diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 73c85d13e2ef..256ce2a48a6c 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -44,6 +44,7 @@ if(USE_MICRO) "src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime" + "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module" "src/runtime/crt/host crt_config.h -> src/runtime/crt/host" "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common" "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server" diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 16d0e793848b..93a332a5924f 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -41,6 +41,9 @@ typedef enum { kTvmErrorCategoryWriteStream = 3, kTvmErrorCategorySession = 4, 
kTvmErrorCategoryPlatform = 5, + kTvmErrorCategoryGenerated = 6, + kTvmErrorCategoryGraphRuntime = 7, + kTvmErrorCategoryFunctionCall = 8, } tvm_crt_error_category_t; typedef enum { @@ -74,6 +77,19 @@ typedef enum { kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1), kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), + // Common error codes returned from generated functions. + kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), + + // Graph runtime + kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 0), + kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 1), + kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 2), + + // Function Calls - common problems encountered calling functions. + kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), + kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1), + kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2), + // System errors are always negative integers; this mask indicates presence of a system error. // Cast tvm_crt_error_t to a signed integer to interpret the negative error code. kTvmErrorSystemErrorMask = (1 << (sizeof(int) * 4 - 1)), diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_runtime.h index d2eb3b7785e9..dc17debbc69b 100644 --- a/include/tvm/runtime/crt/graph_runtime.h +++ b/include/tvm/runtime/crt/graph_runtime.h @@ -69,6 +69,12 @@ TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const struct TVMMo int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); +/*! + * \brief get number of input tensors allocated. + * \return integer number of tensors available to use. + */ +int TVMGraphRuntime_GetNumInputs(); + /*! * \brief set input to the graph based on name. * \param runtime The graph runtime. @@ -77,6 +83,12 @@ int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); */ void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); +/*! + * \brief get number of output tensors allocated. + * \return integer number of output tensors allocated. + */ +int TVMGraphRuntime_GetNumOutputs(); + /*! * \brief Return NDArray for given output index. * \param runtime The graph runtime. diff --git a/include/tvm/runtime/crt/graph_runtime_module.h b/include/tvm/runtime/crt/graph_runtime_module.h new file mode 100644 index 000000000000..04e9184c8b8d --- /dev/null +++ b/include/tvm/runtime/crt/graph_runtime_module.h @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_runtime.h + * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + */ +#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/*! + * \brief Register the "tvm.graph_runtime.create" constructor PackedFunc. + */ +tvm_crt_error_t TVMGraphRuntimeModule_Register(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ diff --git a/include/tvm/runtime/crt/module.h b/include/tvm/runtime/crt/module.h index 2359025f6fe1..7b124c4faa3a 100644 --- a/include/tvm/runtime/crt/module.h +++ b/include/tvm/runtime/crt/module.h @@ -39,6 +39,14 @@ typedef struct TVMModule { const TVMFuncRegistry* registry; } TVMModule; +/*! + * \brief Create a new module handle from the given TVMModule instance. + * \param mod The module instance to register. + * \param out_handle Pointer to recieve the newly-minted handle for this module. + * \return 0 on success, non-zero on error. + */ +int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle); + /*! \brief Entry point for the system lib module. */ const TVMModule* TVMSystemLibEntryPoint(void); diff --git a/python/tvm/contrib/binutils.py b/python/tvm/contrib/binutils.py index 646362a5587f..146944970827 100644 --- a/python/tvm/contrib/binutils.py +++ b/python/tvm/contrib/binutils.py @@ -16,61 +16,13 @@ # under the License. """Utilities for binary file manipulation""" +import logging import os import subprocess import tvm._ffi from . import utils -# TODO does this file still belong in `contrib`. is it too µTVM-specific? - -# TODO shouldn't need so many `ALIGN` directives -RELOCATION_LD_SCRIPT_TEMPLATE = """ -/* linker symbol for use in UTVMInit */ -_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; - -SECTIONS -{{ - . = 0x{text_start:x}; - . = ALIGN({word_size}); - .text : - {{ - . = ALIGN({word_size}); - KEEP(*(.text)) - KEEP(*(.text*)) - . = ALIGN({word_size}); - }} - - . = 0x{rodata_start:x}; - . = ALIGN({word_size}); - .rodata : - {{ - . = ALIGN({word_size}); - KEEP(*(.rodata)) - KEEP(*(.rodata*)) - . = ALIGN({word_size}); - }} - - . = 0x{data_start:x}; - . = ALIGN({word_size}); - .data : - {{ - . = ALIGN({word_size}); - KEEP(*(.data)) - KEEP(*(.data*)) - . = ALIGN({word_size}); - }} - - . = 0x{bss_start:x}; - . = ALIGN({word_size}); - .bss : - {{ - . = ALIGN({word_size}); - KEEP(*(.bss)) - KEEP(*(.bss*)) - . = ALIGN({word_size}); - }} -}} -""" +_LOG = logging.getLogger(__name__) def run_cmd(cmd): @@ -86,6 +38,7 @@ def run_cmd(cmd): output : str resulting stdout capture from the subprocess """ + _LOG.debug('execute: %s', ' '.join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (output, _) = proc.communicate() output = output.decode("utf-8") @@ -94,227 +47,3 @@ def run_cmd(cmd): msg = f'error while running command "{cmd_str}":\n{output}' raise RuntimeError(msg) return output - - -@tvm._ffi.register_func("tvm_callback_get_section_size") -def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): - """Finds size of the section in the binary. 
- Assumes `size` shell command exists (typically works only on Linux machines) - - Parameters - ---------- - binary_path : str - path of the binary file - - section_name : str - name of section - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - size : integer - size of the section in bytes - """ - if not os.path.isfile(binary_path): - raise RuntimeError('no such file "{}"'.format(binary_path)) - # We use the "-A" flag here to get the ".rodata" section's size, which is - # not included by default. - size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) - - # TODO(weberlo): Refactor this method and `*relocate_binary` so they are - # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". - section_mapping = { - ".text": [".text"], - ".rodata": [".rodata"], - ".data": [".data", ".sdata"], - ".bss": [".bss", ".sbss"], - } - sections_to_sum = section_mapping["." + section_name] - section_size = 0 - # Skip the first two header lines in the `size` output. - for line in size_output.split("\n")[2:]: - tokens = list(filter(lambda s: len(s) != 0, line.split(" "))) - if len(tokens) != 3: - continue - entry_name = tokens[0] - entry_size = int(tokens[1]) - for section in sections_to_sum: - if entry_name.startswith(section): - section_size += entry_size - break - - # NOTE: in the past, section_size has been wrong on x86. it may be - # inconsistent. TODO: maybe stop relying on `*size` to give us the size and - # instead read the section with `*objcopy` and count the bytes. - # NOTE(areusch): I think the problem is due to alignment ops in the linker. - # Since this is going away in the impending switch to on-device runtime, - # add a constant to hopefully absorb these relocations. - if section_size > 0: - section_size += 64 - - return section_size - - -@tvm._ffi.register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary( - binary_path, - word_size, - text_start, - rodata_start, - data_start, - bss_start, - stack_end, - toolchain_prefix, -): - """Relocates sections in the binary to new addresses - - Parameters - ---------- - binary_path : str - path of the binary file - - word_size : int - word size on the target machine - - text_start : int - text section address - - rodata_start : int - rodata section address - - data_start : int - data section address - - bss_start : int - bss section address - - stack_end : int - stack section end address - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - rel_bin : bytearray - the relocated binary - """ - assert text_start < rodata_start < data_start < bss_start < stack_end - stack_pointer_init = stack_end - word_size - ld_script_contents = "" - # TODO(weberlo): There should be a better way to configure this for different archs. - # TODO is this line even necessary? 
- if "riscv" in toolchain_prefix: - ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' - ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( - word_size=word_size, - text_start=text_start, - rodata_start=rodata_start, - data_start=data_start, - bss_start=bss_start, - stack_pointer_init=stack_pointer_init, - ) - - tmp_dir = utils.tempdir() - rel_obj_path = tmp_dir.relpath("relocated.obj") - rel_ld_script_path = tmp_dir.relpath("relocate.lds") - with open(rel_ld_script_path, "w") as f: - f.write(ld_script_contents) - run_cmd( - ["{}ld".format(toolchain_prefix), binary_path, "-T", rel_ld_script_path, "-o", rel_obj_path] - ) - - with open(rel_obj_path, "rb") as f: - rel_bin = bytearray(f.read()) - - gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR") - if gdb_init_dir is not None: - gdb_init_path = f"{gdb_init_dir}/.gdbinit" - with open(gdb_init_path, "r") as f: - gdbinit_contents = f.read().split("\n") - new_contents = [] - for line in gdbinit_contents: - new_contents.append(line) - if line.startswith("target"): - new_contents.append(f"add-symbol-file {rel_obj_path}") - with open(gdb_init_path, "w") as f: - f.write("\n".join(new_contents)) - - return rel_bin - - -@tvm._ffi.register_func("tvm_callback_read_binary_section") -def tvm_callback_read_binary_section(binary, section, toolchain_prefix): - """Returns the contents of the specified section in the binary byte array - - Parameters - ---------- - binary : bytearray - contents of the binary - - section : str - type of section - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - section_bin : bytearray - contents of the read section - """ - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("temp.bin") - tmp_section = tmp_dir.relpath("tmp_section.bin") - with open(tmp_bin, "wb") as out_file: - out_file.write(bytes(binary)) - run_cmd( - [ - "{}objcopy".format(toolchain_prefix), - "--dump-section", - ".{}={}".format(section, tmp_section), - tmp_bin, - ] - ) - if os.path.isfile(tmp_section): - # Get section content if it exists. - with open(tmp_section, "rb") as f: - section_bin = bytearray(f.read()) - else: - # Return empty bytearray if the section does not exist. 
- section_bin = bytearray("", "utf-8") - return section_bin - - -@tvm._ffi.register_func("tvm_callback_get_symbol_map") -def tvm_callback_get_symbol_map(binary, toolchain_prefix): - """Obtains a map of symbols to addresses in the passed binary - - Parameters - ---------- - binary : bytearray - contents of the binary - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - map_str : str - map of defined symbols to addresses, encoded as a series of - alternating newline-separated keys and values - """ - tmp_dir = utils.tempdir() - tmp_obj = tmp_dir.relpath("tmp_obj.bin") - with open(tmp_obj, "wb") as out_file: - out_file.write(bytes(binary)) - nm_output = run_cmd(["{}nm".format(toolchain_prefix), "-C", "--defined-only", tmp_obj]) - nm_output = nm_output.splitlines() - map_str = "" - for line in nm_output: - line = line.split() - map_str += line[2] + "\n" - map_str += line[0] + "\n" - return map_str diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index d1a3c4163755..bed5bde6f916 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -23,6 +23,8 @@ import re from tvm.contrib import utils +from .micro_library import MicroLibrary + _LOG = logging.getLogger(__name__) @@ -109,7 +111,8 @@ def default_options(target_include_dir): def build_static_runtime( - workspace, compiler, module, lib_opts=None, bin_opts=None, generated_lib_opts=None + workspace, compiler, module, lib_opts=None, bin_opts=None, generated_lib_opts=None, + extra_libs=None ): """Build the on-device runtime, statically linking the given modules. @@ -131,6 +134,12 @@ def build_static_runtime( The `options` parameter passed to compiler.library() when compiling the generated TVM C source module. + extra_libs : Optional[List[MicroLibrary|str]] + If specified, extra libraries to be compiled into the binary. If a MicroLibrary, it is + included into the binary directly. If a string, the path to a directory; all direct children + of this directory matching RUNTIME_SRC_REGEX are built into a library. These libraries are + placed before any common CRT libraries in the link order. + Returns ------- MicroBinary : @@ -150,7 +159,12 @@ def build_static_runtime( module.save(mod_src_path, "cc") libs = [] - for lib_src_dir in RUNTIME_LIB_SRC_DIRS: + for mod_or_src_dir in (extra_libs or []) + RUNTIME_LIB_SRC_DIRS: + if isinstance(mod_or_src_dir, MicroLibrary): + libs.append(mod_or_src_dir) + continue + + lib_src_dir = mod_or_src_dir lib_name = os.path.basename(lib_src_dir) lib_build_dir = workspace.relpath(f"build/{lib_name}") os.makedirs(lib_build_dir) diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index b76d46a04db6..18ed350305d8 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -19,6 +19,7 @@ import atexit import abc +import errno import logging import os import signal @@ -26,12 +27,14 @@ import sys import termios import threading +import time import psutil from .._ffi import register_func from . import class_factory from . 
import transport +from .transport.file_descriptor import FdTransport _LOG = logging.getLogger(__name__) @@ -195,7 +198,8 @@ def popen_kwargs(self): else: raise NotImplementedError(f"System {sysname} is not yet supported") - self.fd_transport = fd.FdTransport(stdout_read, stdin_write) + self.fd_transport = FdTransport( + stdout_read, stdin_write, timeouts=transport.debug_transport_timeouts()) self.fd_transport.open() return { @@ -227,13 +231,33 @@ def open(self): pass # Pipes opened by parent class. def write(self, data, timeout_sec): - return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) + end_time = time.monotonic() + timeout_sec + while timeout_sec == 0 or time.monotonic() < end_time: + try: + return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) + except OSError as e: + if e.errno == errno.EAGAIN: + time.sleep(0.1) + continue + raise e + + raise base.IoTimeoutError() def read(self, n, timeout_sec): - return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) + end_time = time.monotonic() + timeout_sec + while timeout_sec == 0 or time.monotonic() < end_time: + try: + return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) + except OSError as e: + if e.errno == errno.EAGAIN: + time.sleep(0.1) + continue + raise e + + raise base.IoTimeoutError() def close(self): - pass # Pipes closed by parent class. + pass # Pipes closed by parent class (DebugWrapperTransport calls stop() next). def transport(self): return self._Transport(self) diff --git a/python/tvm/micro/transport/__init__.py b/python/tvm/micro/transport/__init__.py index 1e1709707568..dffe9ae32792 100644 --- a/python/tvm/micro/transport/__init__.py +++ b/python/tvm/micro/transport/__init__.py @@ -22,5 +22,6 @@ from .base import TransportClosedError from .base import TransportLogger from .base import TransportTimeouts +from .base import debug_transport_timeouts from .debug import DebugWrapperTransport from .subprocess import SubprocessTransport diff --git a/python/tvm/micro/transport/base.py b/python/tvm/micro/transport/base.py index f8951f6226a5..07332e8a745d 100644 --- a/python/tvm/micro/transport/base.py +++ b/python/tvm/micro/transport/base.py @@ -64,7 +64,7 @@ class IoTimeoutError(Exception): ) -def debug_transport_timeouts(session_start_retry_timeout_sec=0.0): +def debug_transport_timeouts(session_start_retry_timeout_sec=0): return TransportTimeouts( session_start_retry_timeout_sec=session_start_retry_timeout_sec, session_start_timeout_sec=0, diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py index 3f69c4c26751..6df6cd425eff 100644 --- a/python/tvm/micro/transport/file_descriptor.py +++ b/python/tvm/micro/transport/file_descriptor.py @@ -62,8 +62,11 @@ def open(self): def close(self): if self.read_fd is not None: os.close(self.read_fd) + self.read_fd = None + if self.write_fd is not None: os.close(self.write_fd) + self.write_fd = None def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None): if end_time is None: @@ -78,6 +81,9 @@ def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None): return True def read(self, n, timeout_sec): + if self.read_fd is None: + raise base.TransportClosedError() + end_time = None if timeout_sec is None else time.monotonic() + timeout_sec self._await_ready([self.read_fd], [], end_time=end_time) @@ -90,6 +96,9 @@ def read(self, n, timeout_sec): return to_return def write(self, data, timeout_sec): + if self.write_fd is None: + raise base.TransportClosedError() + 
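+        # As in read() above, write_fd is cleared by close(), so this guards against use of
+        # the transport after it has been closed.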
end_time = None if timeout_sec is None else time.monotonic() + timeout_sec data_len = len(data) diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 8a24db4e8b2b..6e462431173f 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -65,7 +65,7 @@ $(notdir $(1)): $${BUILD_DIR}/lib$(notdir $(1)).a endef -LIBS = src/runtime/crt/common src/runtime/crt/graph_runtime src/runtime/crt/utvm_rpc_common src/runtime/crt/utvm_rpc_server +LIBS = src/runtime/crt/common src/runtime/crt/graph_runtime src/runtime/crt/graph_runtime_module src/runtime/crt/utvm_rpc_common src/runtime/crt/utvm_rpc_server $(foreach lib,$(LIBS),$(eval $(call LIB_template,$(lib)))) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index d6f78d9e3a03..9a0663fc704d 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -127,7 +127,7 @@ static TVMModuleHandle EncodeModuleHandle(tvm_module_index_t module_index) { return (TVMModuleHandle)((uintptr_t)(module_index | 0x8000)); } -static int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle) { +int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle) { tvm_module_index_t idx; for (idx = 0; idx < TVM_CRT_MAX_REGISTERED_MODULES; idx++) { diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index a6cd77ad6a22..68213b7dd3c3 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -539,6 +539,15 @@ uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint return runtime->node_row_ptr[nid] + index; } +/*! + * \brief Get the number of input tensors allocated. + * \param runtime The graph runtime. + * \return the number of input tensors allocated. + */ +int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { + return runtime->input_nodes_count; +} + /*! * \brief Get the input index given the name of input. * \param runtime The graph runtime. @@ -675,6 +684,15 @@ void TVMGraphRuntime_Run(TVMGraphRuntime* runtime) { } } +/*! + * \brief Get the number of output tensors allocated. + * \param runtime The graph runtime. + * \return the number of output tensors allocated. 
+ */ +int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { + return runtime->outputs_count; +} + int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out) { int status = 0; uint32_t nid = runtime->outputs[idx].node_id; @@ -875,7 +893,6 @@ void TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, cons TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const TVMModule* m, const TVMContext* ctxs) { - CHECK_EQ(vleak_size, 1, "memory leak checking won't work with concurrent CRT use"); TVMGraphRuntime* runtime = (TVMGraphRuntime*)vmalloc(sizeof(TVMGraphRuntime)); // NOLINT(*) memset(runtime, 0, sizeof(TVMGraphRuntime)); // init @@ -909,6 +926,4 @@ void TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { vfree(g_fexecs); g_fexecs = 0; } - - CHECK_EQ(vleak_size, 1, "found memory leak, leak size=%d", vleak_size - 1); } diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c new file mode 100644 index 000000000000..a8de71e33f9d --- /dev/null +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// LINT_C_FILE + +/*! + * \file graph_runtime_module.c + * \brief wrap graph_runtime into a TVMModule for use with RPC. 
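+ *
+ * A single static GraphRuntimeModule instance backs this module: its constructor is
+ * registered as the global PackedFunc "tvm.graph_runtime.create" by
+ * TVMGraphRuntimeModule_Register(), and attempting to create a second runtime returns
+ * kTvmErrorGraphModuleAlreadyCreated.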
+ */ + +#include +#include +#include +#include + +#include "tvm/runtime/crt/internal/graph_runtime/graph_runtime.h" + +typedef struct { + TVMModule mod; + TVMGraphRuntime* runtime; +} GraphRuntimeModule; + +static GraphRuntimeModule graph_runtime; + +int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (graph_runtime.runtime != NULL) { + return kTvmErrorGraphModuleAlreadyCreated; + } + + if (nargs != 4) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr || tcodes[1] != kTVMModuleHandle || tcodes[2] != kTVMArgInt || tcodes[3] != kTVMArgInt) { + return kTvmErrorFunctionCallWrongArgType; + } + + if (args[2].v_int64 != kDLCPU || args[3].v_int64 != 0) { + return kTvmErrorGraphModuleBadContext; + } + + TVMContext ctx = {(DLDeviceType) args[2].v_int64, (int) args[3].v_int64}; + graph_runtime.runtime = TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &ctx); + + TVMModuleHandle out; + int ret_value = TVMModCreateFromCModule(&graph_runtime.mod, &out); + if (ret_value != 0) { + ret_tcodes[0] = kTVMNullptr; + TVMGraphRuntime_Release(&graph_runtime.runtime); + return ret_value; + } + + ret_values[0].v_handle = out; + ret_tcodes[0] = kTVMModuleHandle; + return kTvmErrorNoError; +} + +int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr) { + return kTvmErrorFunctionCallWrongArgType; + } + + int index = TVMGraphRuntime_GetInputIndex(graph_runtime.runtime, args[0].v_str); + if (index < 0) { + return kTvmErrorGraphModuleNoSuchInput; + } + + uint32_t eid = TVMGraphRuntime_GetEntryId( + graph_runtime.runtime, graph_runtime.runtime->input_nodes[index], 0); + ret_values[0].v_handle = (void*) &graph_runtime.runtime->data_entry[eid].dl_tensor; + ret_tcodes[0] = kTVMNDArrayHandle; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + ret_values[0].v_int64 = TVMGraphRuntime_GetNumInputs(); + ret_tcodes[0] = kTVMArgInt; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(); + ret_tcodes[0] = kTVMArgInt; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMArgInt) { + return kTvmErrorFunctionCallWrongArgType; + } + + int output_index = args[0].v_int64; + if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs()) { + return kTvmErrorGraphModuleNoSuchInput; + } + + uint32_t nid = graph_runtime.runtime->outputs[output_index].node_id; + uint32_t index = graph_runtime.runtime->outputs[output_index].index; + uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); + + ret_values[0].v_handle = (void*) &(graph_runtime.runtime->data_entry[eid].dl_tensor); + ret_tcodes[0] = kTVMNDArrayHandle; + return 0; +} + +int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* 
tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMBytes) { + return kTvmErrorFunctionCallWrongArgType; + } + + ret_tcodes[0] = kTVMNullptr; + + TVMByteArray* arr = (TVMByteArray*) args[0].v_handle; + return TVMGraphRuntime_LoadParams(graph_runtime.runtime, arr->data, arr->size); +} + +int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + TVMGraphRuntime_Run(graph_runtime.runtime); + + ret_tcodes[0] = kTVMNullptr; + return 0; +} + +int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + if (nargs != 2) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr || tcodes[1] != kTVMDLTensorHandle) { + return kTvmErrorFunctionCallWrongArgType; + } + + TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*) args[1].v_handle); + + ret_tcodes[0] = kTVMNullptr; + return 0; +} + +int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { + return kTvmErrorFunctionCallNotImplemented; +} + +static const TVMBackendPackedCFunc graph_runtime_registry_funcs[] = { + &TVMGraphRuntimeModule_GetInput, + &TVMGraphRuntimeModule_GetNumInputs, + &TVMGraphRuntimeModule_GetNumOutputs, + &TVMGraphRuntimeModule_GetOutput, + &TVMGraphRuntimeModule_LoadParams, + &TVMGraphRuntimeModule_Run, + &TVMGraphRuntimeModule_SetInput, + &TVMGraphRuntimeModule_NotImplemented, +}; + + +static const TVMFuncRegistry graph_runtime_registry = { + "\x08get_input\0" + "get_num_inputs\0" + "get_num_outputs\0" + "get_output\0" + "load_params\0" + "run\0" + "set_input\0" + "share_params\0", + graph_runtime_registry_funcs}; + +tvm_crt_error_t TVMGraphRuntimeModule_Register() { + graph_runtime.mod.registry = &graph_runtime_registry; + graph_runtime.runtime = NULL; + + return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); +} diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 60797c39b41d..8705ca899103 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -32,6 +32,10 @@ #include "crt_config.h" +#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE +#include +#endif + using namespace std::chrono; extern "C" { @@ -95,6 +99,11 @@ int main(int argc, char** argv) { utvm_rpc_server_t rpc_server = UTvmRpcServerInit(memory, sizeof(memory), 8, &UTvmWriteFunc, nullptr); +#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE + CHECK_EQ(TVMGraphRuntimeModule_Register(), kTvmErrorNoError, + "failed to register GraphRuntime TVMModule"); +#endif + if (TVMFuncRegisterGlobal("tvm.testing.reset_server", (TVMFunctionHandle)&testonly_reset_server, 0)) { fprintf(stderr, "utvm runtime: internal error registering global packedfunc; exiting\n"); diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h index 7ea7a4f035c8..ee095325deef 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h @@ -100,6 +100,7 @@ typedef struct TVMGraphRuntime { typedef DLTensor* DLTensorPtr; // 
private functions +uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index); void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, const uint32_t param_size); diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 662597086d8a..38252bc27745 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -121,6 +121,7 @@ class MicroTransportChannel : public RPCChannel { ::std::string chunk; if (timeout != ::std::chrono::microseconds::zero()) { + LOG(INFO) << "ReceiveUntil no-timeout " << timeout.count() << " us"; ::std::chrono::microseconds iter_timeout{ ::std::max(::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( @@ -288,6 +289,7 @@ class MicroTransportChannel : public RPCChannel { }; TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* rv) { + LOG(INFO) << "MICRO RPC CONNECT " << uint64_t(args[3]) << ", " << uint64_t(args[4]) << ", " << uint64_t(args[5]); MicroTransportChannel* micro_channel = new MicroTransportChannel(args[1], args[2], ::std::chrono::microseconds(uint64_t(args[3])), ::std::chrono::microseconds(uint64_t(args[4])), diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index fbdd93fb4f62..ef7b34079606 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -370,6 +370,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { */ void HandleReturn(RPCCode code, RPCSession::FEncodeReturn setreturn) { TVMArgs args = RecvPackedSeq(); + LOG(INFO) << "Receive PackedSeq " << args.size(); if (code == RPCCode::kException) { // switch to the state before sending exception. diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index a3ce97e383d8..34e5a4a9dd8e 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -245,9 +245,7 @@ def _format_c_value(dtype, width, x): def test_c_link_params(): temp_dir = utils.tempdir() for dtype in LINKABLE_DTYPES: - print("test", dtype) mod, param_init = _make_mod_and_params(dtype) - print('built mod', mod) rand_input = _make_random_tensor(dtype) main_func = mod['main'] target = 'c --link-params' @@ -331,6 +329,68 @@ def _run_unlinked(lib_mod): np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) +@tvm.testing.requires_micro +def test_crt_link_params(): + import tvm.micro + + + for dtype in LINKABLE_DTYPES: + mod, param_init = _make_mod_and_params(dtype) + rand_input = _make_random_tensor(dtype) + main_func = mod['main'] + target = 'c -mcpu=native --system-lib --runtime=c --link-params' + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + graph_json, lib, params = tvm.relay.build(mod, target, params=param_init) + assert set(params.keys()) == {"p0"} # NOTE: op folded + + workspace = tvm.micro.Workspace() + compiler = tvm.micro.DefaultCompiler(target=target) + opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) + opts['bin_opts']['ldflags'].append('-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE') + + micro_binary = tvm.micro.build_static_runtime( + # the x86 compiler *expects* you to give the exact same dictionary for both + # lib_opts and bin_opts. 
so the library compiler is mutating lib_opts and + # the binary compiler is expecting those mutations to be in bin_opts. + # TODO(weberlo) fix this very bizarre behavior + workspace, + compiler, + lib, + lib_opts=opts["bin_opts"], + bin_opts=opts["bin_opts"], + extra_libs=[os.path.join(tvm.micro.CRT_ROOT_DIR, m) + for m in ('graph_runtime', 'graph_runtime_module')], + ) + + flasher_kw = { + "debug": False, + } + flasher = compiler.flasher(**flasher_kw) + with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: + rpc_lib = sess.get_system_lib() + graph_rt = tvm.contrib.graph_runtime.create( + graph_json, rpc_lib, sess.context) + + graph_rt.set_input('rand_input', rand_input, **params) + graph_rt.run() + linked_output = graph_rt.get_output(0).asnumpy() + + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, 'llvm --system-lib', params=param_init) + + def _run_unlinked(lib): + graph_json, mod, lowered_params = lib + graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt.set_input('rand_input', rand_input, **lowered_params) + graph_rt.run() + return graph_rt.get_output(0) + + unlinked_output = _run_unlinked(lib).asnumpy() + + if 'int' in dtype: + np.testing.assert_equal(unlinked_output, linked_output) + else: + np.testing.assert_allclose(unlinked_output, linked_output) if __name__ == '__main__': From 1afa10e9fddb7a5d6c9a8a4088fd8d0b3f7c3d6c Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 10 Nov 2020 21:47:31 -0800 Subject: [PATCH 08/60] CRT works! --- include/tvm/runtime/crt/graph_runtime.h | 4 +- src/runtime/crt/graph_runtime/graph_runtime.c | 66 ++++++++++++++----- .../internal/graph_runtime/graph_runtime.h | 9 ++- tests/python/unittest/test_link_params.py | 3 +- 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_runtime.h index dc17debbc69b..e8413aa1723d 100644 --- a/include/tvm/runtime/crt/graph_runtime.h +++ b/include/tvm/runtime/crt/graph_runtime.h @@ -61,10 +61,10 @@ typedef struct TVMGraphRuntime TVMGraphRuntime; * \brief Allocate a new GraphRuntime with vmalloc and initialize it. * * \param sym_json JSON-encoded graph. - * \param m TVM Module that exposes the functions to call. + * \param module_handle TVM Module that exposes the functions to call. * \param ctxs runtime execution context. */ -TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const struct TVMModule* m, +TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, const TVMContext* ctxs); int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index 68213b7dd3c3..03d81aa184f8 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -711,8 +711,19 @@ int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTen } void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { + TVMPackedFunc lookup_linked_param; + int lookup_linked_param_valid; uint32_t idx; + { + TVMArgs temp_args; + temp_args.values[0].v_int64 = 0; + temp_args.tcodes[0] = kTVMArgInt; + temp_args.values_count = 1; + lookup_linked_param_valid = + (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, "_lookup_linked_param", &temp_args) == 0); + } + // Grab saved optimization plan from graph. 
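+  // NOTE: the probe above packs a single int64 argument (later overwritten with each
+  // storage_id) and asks the module for "_lookup_linked_param". When the function is absent,
+  // lookup_linked_param_valid stays false and every pool entry below falls back to a freshly
+  // allocated TVMNDArray; when it is present, entries whose lookup succeeds reuse the returned
+  // handle as their tensor data and are marked is_linked_param so Release() never frees them.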
TVMGraphRuntimeGraphAttr* attrs = &(runtime->attrs); DLDataType* vtype = vmalloc(sizeof(DLDataType) * attrs->dltype_count); @@ -739,6 +750,7 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { if (sid >= pool_entry_count) { pool_entry_count = sid + 1; } + pool_entry[sid].entry_id = idx; pool_entry[sid].size = MAX(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; } @@ -746,17 +758,36 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { // Allocate the space. for (idx = 0; idx < pool_entry_count; idx++) { runtime->storage_pool = - vrealloc(runtime->storage_pool, sizeof(TVMNDArray) * (runtime->storage_pool_count + 1)); + vrealloc(runtime->storage_pool, + sizeof(TVMGraphRuntimeStorageEntry) * (runtime->storage_pool_count + 1)); TVMGraphRuntimePoolEntry pit = pool_entry[idx]; - int64_t shape[TVM_CRT_MAX_NDIM] = { - 0, - }; TVMContext ctx = runtime->ctxs[0]; - DLDataType dtype = {kDLFloat, 32, 1}; - shape[0] = (pit.size + 3) / 4; - runtime->storage_pool[runtime->storage_pool_count] = TVMNDArray_Empty(1, shape, dtype, ctx); - CHECK_NE(runtime->storage_pool[runtime->storage_pool_count].dl_tensor.data, 0, - "fail to create storage_pool with idx=%d\n", idx); + uint8_t did_find_linked_param = 0; + if (lookup_linked_param_valid) { + lookup_linked_param.args.values[0].v_int64 = idx; + if (lookup_linked_param.Call(&lookup_linked_param) == 0) { + runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1; + DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor; + tensor->data = lookup_linked_param.ret_value.values[0].v_handle; + tensor->ctx = ctx; + tensor->ndim = attrs->ndim[pit.entry_id]; + tensor->shape = attrs->shape + idx * TVM_CRT_MAX_NDIM; + tensor->strides = NULL; + tensor->byte_offset = 0; + did_find_linked_param = 1; + } + } + if (did_find_linked_param == 0) { + int64_t shape[TVM_CRT_MAX_NDIM] = { + 0, + }; + DLDataType dtype = {kDLFloat, 32, 1}; + shape[0] = (pit.size + 3) / 4; + runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 0; + runtime->storage_pool[runtime->storage_pool_count].array = TVMNDArray_Empty(1, shape, dtype, ctx); + CHECK_NE(runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor.data, 0, + "fail to create storage_pool with idx=%d\n", idx); + } runtime->storage_pool_count++; } @@ -769,7 +800,7 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { uint32_t storage_id = attrs->storage_id[idx]; CHECK(storage_id < runtime->storage_pool_count); runtime->data_entry[idx] = - TVMNDArray_CreateView(&(runtime->storage_pool[storage_id]), + TVMNDArray_CreateView(&(runtime->storage_pool[storage_id].array), attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx], vtype[idx]); CHECK_NE(runtime->data_entry[idx].dl_tensor.data, 0, "fail to create for node with idx=%d, storage_id=%u\n", idx, storage_id); @@ -876,27 +907,28 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* /*! * \brief Initialize the graph executor with graph and context. * \param graph_json The execution graph. - * \param module The module containing the compiled functions for the host + * \param module_handle The module containing the compiled functions for the host * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. 
*/ -void TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, const TVMModule* module, - const TVMContext* ctxs) { +void TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, + TVMModuleHandle module_handle, const TVMContext* ctxs) { JSONReader reader = JSONReader_Create(graph_json); TVMGraphRuntime_Load(runtime, &reader); JSONReader_Release(&reader); + runtime->module_handle = module_handle; runtime->ctxs[0] = ctxs[0]; TVMGraphRuntime_SetupStorage(runtime); TVMGraphRuntime_SetupOpExecs(runtime); } -TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const TVMModule* m, +TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, const TVMContext* ctxs) { TVMGraphRuntime* runtime = (TVMGraphRuntime*)vmalloc(sizeof(TVMGraphRuntime)); // NOLINT(*) memset(runtime, 0, sizeof(TVMGraphRuntime)); // init - TVMGraphRuntime_Init(runtime, sym_json, m, ctxs); + TVMGraphRuntime_Init(runtime, sym_json, module_handle, ctxs); return runtime; } @@ -909,7 +941,9 @@ void TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { vfree(runtime->nodes); TVMGraphRuntimeGraphAttr_Release(&(runtime->attrs)); for (idx = 0; idx < runtime->storage_pool_count; ++idx) { - TVMNDArray_Release(&(runtime->storage_pool[idx])); + if (runtime->storage_pool[idx].is_linked_param == 0) { + TVMNDArray_Release(&(runtime->storage_pool[idx].array)); + } } for (idx = 0; idx < runtime->data_entry_count; ++idx) { vfree(runtime->data_entry[idx].dl_tensor.shape); diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h index ee095325deef..8e0faaa4f199 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h @@ -33,6 +33,7 @@ typedef struct TVMGraphRuntimePoolEntry { size_t size; int device_type; + int entry_id; } TVMGraphRuntimePoolEntry; // Node entry @@ -44,6 +45,12 @@ typedef struct TVMGraphRuntimeNodeEntry { void (*Load)(JSONReader* reader); } TVMGraphRuntimeNodeEntry; +// Storage entry. +typedef struct TVMGraphRuntimeStorageEntry { + uint8_t is_linked_param; + TVMNDArray array; +} TVMGraphRuntimeStorageEntry; + // Node typedef struct TVMGraphRuntimeNode { // operator type in string @@ -87,7 +94,7 @@ typedef struct TVMGraphRuntime { TVMContext ctxs[1]; uint32_t ctxs_count; /*! \brief Common storage pool for all devices. */ - TVMNDArray* storage_pool; + TVMGraphRuntimeStorageEntry* storage_pool; uint32_t storage_pool_count; /*! \brief Data entry of each node. */ TVMNDArray* data_entry; diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 34e5a4a9dd8e..f134db37c36d 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -371,7 +371,8 @@ def test_crt_link_params(): graph_rt = tvm.contrib.graph_runtime.create( graph_json, rpc_lib, sess.context) - graph_rt.set_input('rand_input', rand_input, **params) + # NOTE: not setting params here. 
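+        # With --link-params in the target string the parameter tensors are compiled into the
+        # module and resolved at runtime through its "_lookup_linked_param" function (wired up
+        # by lookup_remote_linked_param in tvm.micro.session), so only the non-parameter input
+        # needs to be supplied. An unlinked build would instead need something like:
+        #   graph_rt.set_input('rand_input', rand_input, **params)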
+ graph_rt.set_input('rand_input', rand_input) graph_rt.run() linked_output = graph_rt.get_output(0).asnumpy() From b85d90f9519155ba654a45d3a5a548611bf6764d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 10 Nov 2020 23:12:57 -0800 Subject: [PATCH 09/60] make stm repo work (half done) --- python/tvm/target/target.py | 3 ++- src/runtime/micro/micro_session.cc | 2 -- src/runtime/rpc/rpc_endpoint.cc | 1 - src/target/llvm/codegen_params.cc | 8 ++++---- src/target/source/codegen_c_host.cc | 5 +++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index ba4a1a2f744e..6ef41748ca5b 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -236,7 +236,8 @@ def micro(model="unknown", options=None): "stm32f746xx": ["-mcpu=cortex-m7"], } opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options + trans_table[model] + ["-runtime=c", "--system-lib", "--link-params", f"-model={model}"], + options ) # NOTE: in the future, the default micro target will be LLVM except when diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 38252bc27745..662597086d8a 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -121,7 +121,6 @@ class MicroTransportChannel : public RPCChannel { ::std::string chunk; if (timeout != ::std::chrono::microseconds::zero()) { - LOG(INFO) << "ReceiveUntil no-timeout " << timeout.count() << " us"; ::std::chrono::microseconds iter_timeout{ ::std::max(::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( @@ -289,7 +288,6 @@ class MicroTransportChannel : public RPCChannel { }; TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* rv) { - LOG(INFO) << "MICRO RPC CONNECT " << uint64_t(args[3]) << ", " << uint64_t(args[4]) << ", " << uint64_t(args[5]); MicroTransportChannel* micro_channel = new MicroTransportChannel(args[1], args[2], ::std::chrono::microseconds(uint64_t(args[3])), ::std::chrono::microseconds(uint64_t(args[4])), diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index ef7b34079606..fbdd93fb4f62 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -370,7 +370,6 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { */ void HandleReturn(RPCCode code, RPCSession::FEncodeReturn setreturn) { TVMArgs args = RecvPackedSeq(); - LOG(INFO) << "Receive PackedSeq " << args.size(); if (code == RPCCode::kException) { // switch to the state before sending exception. diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 20a1efbb575e..9c0b979044f4 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -322,10 +322,10 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { float elem = static_cast(tensor->dl_tensor.data)[i]; - if (isinf(elem)) { + if (std::isinf(elem)) { // C99 standard. os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (isnan(elem)) { + } else if (std::isnan(elem)) { // GNU extension, implemenatation-dependent. 
os << std::setw(one_element_size_bytes) << "NAN"; } else { @@ -340,10 +340,10 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { double elem = static_cast(tensor->dl_tensor.data)[i]; - if (isinf(elem)) { + if (std::isinf(elem)) { // C99 standard. os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (isnan(elem)) { + } else if (std::isnan(elem)) { // GNU extension, implemenatation-dependent. os << std::setw(one_element_size_bytes) << "NAN"; } else { diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 3896e37d5b5d..9a524b0428cc 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -75,7 +75,8 @@ void CodeGenCHost::LinkParameters(Map params) { function_names_.emplace_back(tvm::runtime::symbol::tvm_lookup_linked_param); for (auto kv : params) { - decl_stream << "#ifdef __cplusplus\n" + decl_stream << "\n" + << "#ifdef __cplusplus\n" << "extern \"C\" {\n" << "#endif\n" << "static const "; @@ -92,7 +93,7 @@ void CodeGenCHost::LinkParameters(Map params) { << "} // extern \"C\"\n" << "#endif\n"; stream << " case " << kv.second->id << ":\n" - << " ((int64_t*)out_ret_value)[0] = (int64_t) " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" + << " ((uint64_t*)out_ret_value)[0] = (uint64_t) (uintptr_t) " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" << " out_ret_tcode[0] = " << kTVMOpaqueHandle << ";\n" << " return 0;\n"; } From bbb6e806832da56c9b2d3e780bf6a247805e4f50 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 12 Nov 2020 22:41:32 -0800 Subject: [PATCH 10/60] works-ish on micro --- python/tvm/micro/session.py | 46 +++++++- .../graph_runtime_module.c | 6 +- .../graph/debug/graph_runtime_debug.cc | 15 ++- src/runtime/graph/graph_runtime.cc | 102 +++++++++++++----- src/runtime/graph/graph_runtime.h | 14 ++- src/runtime/graph/graph_runtime_factory.cc | 2 +- src/runtime/rpc/rpc_module.cc | 56 +++++----- src/target/llvm/codegen_llvm.cc | 10 +- src/target/source/codegen_c_host.cc | 3 +- 9 files changed, 185 insertions(+), 69 deletions(-) diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 3f84f3beab5b..5be7d59a143f 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -154,6 +154,20 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self.transport.__exit__(exc_type, exc_value, exc_traceback) +def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): + try: + lookup_linked_param = mod.get_function('_lookup_linked_param') + except KeyError: + return None + + remote_data = lookup_linked_param(storage_id) + if remote_data is None: + return None + + return get_global_func('tvm.rpc.NDArrayFromRemoteOpaqueHandle')( + mod, remote_data, template_tensor, ctx, lambda: None) + + def create_local_graph_runtime(graph_json_str, mod, ctx): """Create a local graph runtime driving execution on the remote CPU context given. 
@@ -175,4 +189,34 @@ def create_local_graph_runtime(graph_json_str, mod, ctx): """ device_type_id = [ctx.device_type, ctx.device_id] fcreate = get_global_func("tvm.graph_runtime.create") - return graph_runtime.GraphModule(fcreate(graph_json_str, mod, *device_type_id)) + return graph_runtime.GraphModule(fcreate(graph_json_str, mod, lookup_remote_linked_param, + *device_type_id)) + + +def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): + """Create a local debug runtime driving execution on the remote CPU context given. + + Parameters + ---------- + graph_json_str : str + A string containing the graph representation. + + mod : tvm.runtime.Module + The remote module containing functions in graph_json_str. + + ctx : tvm.Context + The remote CPU execution context. + + dump_root : Optional[str] + If given, passed as dump_root= to GraphModuleDebug. + + Returns + ------- + tvm.contrib.GraphRuntime : + A local graph runtime instance that executes on the remote device. + """ + device_type_id = [ctx.device_type, ctx.device_id] + fcreate = get_global_func("tvm.graph_runtime_debug.create") + return debug_runtime.GraphModuleDebug( + fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), + [ctx], graph_json_str, dump_root=dump_root) diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c index a8de71e33f9d..98e4693a4fb6 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -88,7 +88,8 @@ int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, T uint32_t eid = TVMGraphRuntime_GetEntryId( graph_runtime.runtime, graph_runtime.runtime->input_nodes[index], 0); ret_values[0].v_handle = (void*) &graph_runtime.runtime->data_entry[eid].dl_tensor; - ret_tcodes[0] = kTVMNDArrayHandle; + ret_tcodes[0] = kTVMOpaqueHandle; + //ret_tcodes[0] = kTVMNDArrayHandle; return 0; } @@ -131,7 +132,8 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); ret_values[0].v_handle = (void*) &(graph_runtime.runtime->data_entry[eid].dl_tensor); - ret_tcodes[0] = kTVMNDArrayHandle; +// ret_tcodes[0] = kTVMNDArrayHandle; + ret_tcodes[0] = kTVMOpaqueHandle; return 0; } diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 3e9ff4f279e7..d02a6d9a0d64 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -202,9 +202,10 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, * \param ctxs All devices contexts. 
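+ * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters
+ * by storage_id; forwarded unchanged to the debug runtime's Init().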
*/ Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs) { + const std::vector& ctxs, + PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs); + exec->Init(sym_json, m, ctxs, lookup_linked_param_func); return Module(exec); } @@ -212,7 +213,15 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " "at least 4, but it has " << args.num_args; - *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args)); + PackedFunc lookup_linked_param_func; + int ctx_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + ctx_start_arg++; + } + + *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args, ctx_start_arg), + lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index c64f773f5157..7c34d9626181 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -64,14 +64,19 @@ void GraphRuntime::Run() { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. + * \param lookup_linked_param_func Linked parameter lookup function. */ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs) { + const std::vector& ctxs, PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; ctxs_ = ctxs; + lookup_linked_param_ = lookup_linked_param_func; + if (lookup_linked_param_ == nullptr) { + lookup_linked_param_ = PackedFunc(&GraphRuntime::DefaultLookupLinkedParam); + } this->SetupStorage(); this->SetupOpExecs(); for (size_t i = 0; i < input_nodes_.size(); i++) { @@ -249,12 +254,47 @@ void GraphRuntime::PreAllocatedDLTensorDeleter(DLManagedTensor* tensor) { delete reinterpret_cast(tensor); } -void GraphRuntime::SetupStorage() { +void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { + Module mod = args[0]; + int64_t storage_id = args[1]; + NDArray template_tensor = args[2]; + TVMContext ctx = args[3]; // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. - tvm::runtime::PackedFunc pf = module_.GetFunction( + tvm::runtime::PackedFunc pf = mod.GetFunction( ::tvm::runtime::symbol::tvm_lookup_linked_param, true); + if (pf == nullptr) { + *rv = nullptr; + return; + } + + TVMRetValue opaque_handle = pf(storage_id); + if (opaque_handle.type_code() == kTVMNullptr) { + *rv = nullptr; + return; + } + + std::unique_ptr container{new NDArray::Container( + static_cast(opaque_handle), template_tensor.Shape(), template_tensor.DataType(), ctx)}; + *rv = NDArray(GetObjectPtr(container.release())); +} + +std::string List2String(std::vector shape) { + if (shape.size() == 0) { + return "[]"; + } + std::stringstream ss; + ss << "[" << shape[0]; + for (int i = 1; i < shape.size(); i++) { + ss << ", " << shape[i]; + } + ss << "]"; + return ss.str(); +} + + +void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. 
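+  // For each storage id the loop below first consults lookup_linked_param_ with
+  // (module_, storage_id, template_tensor, ctx); a defined NDArray return is used as the pool
+  // entry directly, otherwise a flat float32 NDArray of the required size is allocated as before.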
std::vector vtype; for (const std::string& s_type : attrs_.dltype) { @@ -288,12 +328,16 @@ void GraphRuntime::SetupStorage() { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; } - if (pf != nullptr && pool_entry[sid].pre_linked_param == nullptr) { - try { - pool_entry[sid].pre_linked_param = pf(sid); - } catch (std::runtime_error& e) { - // Indicates this storage_id is not pre-linked. - } + TVMRetValue lookup_rv; + { + std::vector shape_vec{attrs_.shape[i].begin(), attrs_.shape[i].end()}; + DLTensor template_tensor{ + nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), vtype[i], shape_vec.data(), nullptr, 0}; + lookup_rv = lookup_linked_param_( + module_, sid, &template_tensor, ctxs_[0]); + } + if (lookup_rv.type_code() != kTVMNullptr) { + pool_entry[sid].linked_param = lookup_rv; } pool_entry[sid].param_data_entry = i; pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); @@ -308,21 +352,11 @@ void GraphRuntime::SetupStorage() { return pit.device_type == static_cast(c.device_type); }); TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; - if (pit.pre_linked_param != nullptr) { - LOG(INFO) << "param " << pit.param_data_entry << " pre-loaded!"; - auto param_shape = &attrs_.shape[pit.param_data_entry]; - DLManagedTensor* param_tensor = new DLManagedTensor{ - {pit.pre_linked_param, ctx, static_cast(param_shape->size()), - vtype[pit.param_data_entry], param_shape->data(), nullptr, 0}, - nullptr, - PreAllocatedDLTensorDeleter}; - - storage_pool_.push_back(NDArray::FromDLPack(param_tensor)); - LOG(INFO) << "Loaded data entry " << pit.param_data_entry - << " from pre-linked blob: " << param_tensor->dl_tensor.data; - + if (pit.linked_param.defined()) { + LOG(INFO) << "param " << storage_pool_.size() << " pre-loaded!"; + storage_pool_.push_back(pit.linked_param); } else { - LOG(INFO) << "param " << pit.param_data_entry << " blank!"; + LOG(INFO) << "param " << storage_pool_.size() << " blank!"; std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); @@ -337,6 +371,9 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; ICHECK_LT(static_cast(storage_id), storage_pool_.size()); + LOG(INFO) << "sid " << i << ": (" << List2String(storage_pool_[storage_id].Shape()) + << ", dtype=" << storage_pool_[storage_id].DataType() << ")" + << ": setup view: " << List2String(attrs_.shape[i]); data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); const DLTensor* tmp = data_entry_[i].operator->(); @@ -497,18 +534,19 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs) { + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs); + exec->Init(sym_json, m, ctxs, lookup_linked_param_func); return Module(exec); } // Get all context for the host and other runtime devices. -std::vector GetAllContext(const TVMArgs& args) { +std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg) { // Reserve the first item as the fallback device. 
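+  // ctx_start_arg exists because tvm.graph_runtime.create may now receive an optional
+  // lookup-linked-param PackedFunc at args[2]; when it does, the (device_type, device_id)
+  // pairs begin one slot later, so callers pass the first context index explicitly.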
std::vector ret; TVMContext ctx; - for (int i = 2; i < args.num_args; i += 2) { + for (int i = ctx_start_arg; i < args.num_args; i += 2) { int dev_type = args[i]; ctx.device_type = static_cast(dev_type); ctx.device_id = args[i + 1]; @@ -526,8 +564,14 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRet ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " "at least 4, but it has " << args.num_args; - const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], args[1], contexts); + PackedFunc lookup_linked_param_func; + int ctx_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + ctx_start_arg++; + } + const auto& contexts = GetAllContext(args, ctx_start_arg); + *rv = GraphRuntimeCreate(args[0], args[1], contexts, lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index 9f0b0962333a..9e95dfc9bf96 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -94,10 +94,13 @@ class TVM_DLL GraphRuntime : public ModuleNode { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. + * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters + * by storage_id. If not given, linked parameters are looked-up using an internal implementation, + * which is not compatible with RPCModules. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs); + const std::vector& ctxs, const PackedFunc lookup_linked_param_func); /*! * \brief Get the input index given the name of input. @@ -182,8 +185,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { struct PoolEntry { size_t size; int device_type; - void* pre_linked_param; int param_data_entry; + NDArray linked_param; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; @@ -366,6 +369,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { } ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } + /*! \brief PackedFunc to lookup a linked paramter from a local Module. */ + static void DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv); /*! \brief Delete pre-allocated DLTensor. */ static void PreAllocatedDLTensorDeleter(DLManagedTensor* tensor); /*! \brief Setup the temporal storage */ @@ -413,9 +418,12 @@ class TVM_DLL GraphRuntime : public ModuleNode { std::vector data_alignment_; /*! \brief Operator on each node. */ std::vector> op_execs_; + /*! \brief Linked parameter lookup function. 
*/ + PackedFunc lookup_linked_param_; + }; -std::vector GetAllContext(const TVMArgs& args); +std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc index 632a25c987bc..2c055e16cc9f 100644 --- a/src/runtime/graph/graph_runtime_factory.cc +++ b/src/runtime/graph/graph_runtime_factory.cc @@ -97,7 +97,7 @@ void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) { Module GraphRuntimeFactory::RuntimeCreate(const std::vector& ctxs) { auto exec = make_object(); - exec->Init(this->graph_json_, this->imports_[0], ctxs); + exec->Init(this->graph_json_, this->imports_[0], ctxs, PackedFunc()); // set params SetParams(exec.get(), this->params_); return Module(exec); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 7f810a229887..0ddd13572949 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -22,6 +22,7 @@ * \brief RPC runtime module. */ #include +#include #include #include @@ -48,37 +49,29 @@ static void RemoteNDArrayDeleter(Object* obj) { /*! * \brief Build a local NDArray with remote backing storage. + * \param sess the RPCSession which owns the given handle. * \param handle A pointer valid on the remote end which should form the `data` field of the * underlying DLTensor. - * \param shape The shape field of this DLTensor. - * \param ndim The rank of this DLTensor. + * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly + * created array. Needed because it's difficult to pass a shape vector as a PackedFunc arg. * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask. * \param deleter A function invoked when the local NDArray object is no longer used. If `handle` * needs to be explicitly deleted after the NDArray is freed, this function should do that. * \param deleter_ctx An opaque pointer passed to deleter to identify the tensor being deleted. */ -NDArray NDArrayFromRemoteOpaqueHandle(void* handle, int64_t* shape, int64_t ndim, DLContext* ctx, FDeleter deleter, void* deleter_ctx) { - NDArray::Container* data = new NDArray::Container(); +NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, DLTensor* template_tensor, TVMContext ctx, ADTObj::FDeleter deleter, void* deleter_ctx) { + ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) + << "The TVMContext given does not belong to the given session"; + RemoteSpace* space = new RemoteSpace(); + space->sess = sess; + space->data = handle; + std::vector shape_vec{template_tensor->shape, + template_tensor->shape + template_tensor->ndim}; + NDArray::Container* data = new NDArray::Container( + static_cast(space), std::move(shape_vec), template_tensor->dtype, ctx); data->manager_ctx = deleter_ctx; data->SetDeleter(deleter); - RemoteSpace* space = new RemoteSpace(); - space->sess = sess_; - space->data = tensor->data; - data->dl_tensor.data = space; - NDArray ret(GetObjectPtr(data)); - // RAII now in effect - data->shape_ = std::vector(tensor->shape, tensor->shape + tensor->ndim); - data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); - data->dl_tensor.ndim = static_cast(data->shape_.size()); - // setup dtype - data->dl_tensor.dtype = tensor->dtype; - // setup ctx - data->dl_tensor.ctx = ctx; - // check strides. 
- ICHECK(tensor->strides == nullptr); - // setup byteoffset - data->dl_tensor.byte_offset = tensor->byte_offset; - return ret; + return NDArray(GetObjectPtr(data)); } @@ -291,7 +284,7 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons ICHECK_EQ(args.size(), 3); DLTensor* tensor = args[1]; void* nd_handle = args[2]; - *rv = NDArrayFromRemoteOpaqueHandle(tensor->data, tensor->shape, tensor->ndim, AddRPCSessionMask(ctx, sess_->table_index()), RemoteNDArrayDeleter, nd_handle); + *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, AddRPCSessionMask(tensor->ctx, sess_->table_index()), RemoteNDArrayDeleter, nd_handle); } else { ICHECK_EQ(args.size(), 2); *rv = args[1]; @@ -477,11 +470,20 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* *rv = static_cast(m.operator->())->sess()->table_index(); }); -TVM_REGISTER_GLOBAL("tvm.rpc.wrap_remote_ndarray").set_body_typed([](void* remote_array, PackedFunc deleter) { - *rv = WrapRemoteNDArray(remote_array, [pf](Object* ctx) { - pf(); +TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle").set_body_typed( + [](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, PackedFunc deleter) -> NDArray { +// auto func = new std::function([deleter]() -> void { +// deleter(); +// }); + return NDArrayFromRemoteOpaqueHandle( + RPCModuleGetSession(mod), remote_array, template_tensor, ctx, + [](Object* context) { +// auto container = static_cast(context); +// auto cb_func = reinterpret_cast*>(container->manager_ctx); +// (*cb_func)(); +// delete cb_func; + }, nullptr);//(void*) func); }); -}); } // namespace runtime } // namespace tvm diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 20cbdf83b971..1b25a691ee4a 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -228,7 +228,14 @@ void CodeGenLLVM::LinkParameters(const Map params) { llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); builder_->SetInsertPoint(default_block); - builder_->CreateRet(ConstInt32(kTvmErrorGeneratedInvalidStorageId)); + { + auto ret_types_array = builder_->CreateBitCast( + &function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + builder_->CreateStore( + llvm::ConstantInt::get(t_int_, kTVMNullptr), + builder_->CreateGEP(ret_types_array, zero_index_list)); + builder_->CreateRet(ConstInt32(kTvmErrorNoError)); + } llvm::raw_os_ostream os{std::cout}; @@ -249,7 +256,6 @@ void CodeGenLLVM::LinkParameters(const Map params) { auto retval_array = builder_->CreateBitCast( &function->arg_begin()[3], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); builder_->CreateStore( -// param_symbol, builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), builder_->CreateGEP(retval_array, zero_index_list)); auto ret_types_array = builder_->CreateBitCast( diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 9a524b0428cc..915d43cffb13 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -71,7 +71,8 @@ void CodeGenCHost::LinkParameters(Map params) { << tvm::runtime::symbol::tvm_lookup_linked_param; stream << " switch (((int64_t*) args)[0]) {\n" << " default:\n" - << " return " << kTvmErrorGeneratedInvalidStorageId << ";\n"; + << " out_ret_tcode[0] = " << kTVMNullptr << ";\n" + << " return 0;\n"; 
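+  // Pieced together with the per-parameter cases added in the loop below, the emitted lookup
+  // function body looks roughly like this (sketch only: the PackedFunc prologue and the exact
+  // per-parameter symbol prefix come from elsewhere in the codegen, and the enum constants are
+  // written out as their numeric values):
+  //
+  //   switch (((int64_t*) args)[0]) {
+  //     default:
+  //       out_ret_tcode[0] = kTVMNullptr;
+  //       return 0;
+  //     case <storage_id>:
+  //       ((uint64_t*)out_ret_value)[0] = (uint64_t) (uintptr_t) <param_prefix><param_name>;
+  //       out_ret_tcode[0] = kTVMOpaqueHandle;
+  //       return 0;
+  //   }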
function_names_.emplace_back(tvm::runtime::symbol::tvm_lookup_linked_param); for (auto kv : params) { From 6e19b2593a332bc9112e2bb49ecca35104c64261 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 09:34:56 -0800 Subject: [PATCH 11/60] final changes for link-params --- src/runtime/crt/common/memory.c | 13 ++++++------- src/runtime/crt/graph_runtime/graph_runtime.c | 7 +++++-- .../crt/graph_runtime_module/graph_runtime_module.c | 6 ++---- src/runtime/graph/graph_runtime.cc | 8 ++++++-- tests/python/unittest/test_link_params.py | 3 +-- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/runtime/crt/common/memory.c b/src/runtime/crt/common/memory.c index 68cad3645146..646ba46feecb 100644 --- a/src/runtime/crt/common/memory.c +++ b/src/runtime/crt/common/memory.c @@ -151,8 +151,8 @@ void* MemoryManager_Alloc(MemoryManager* mgr, tvm_index_t size) { } vleak_size++; #if TVM_CRT_DEBUG > 1 - printf("allocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", data, start, - ptable->max_pages, npage, vleak_size); + TVMLogf("allocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", data, start, + ptable->max_pages, npage, vleak_size); #endif // TVM_CRT_DEBUG return data; } @@ -229,9 +229,8 @@ void* MemoryManager_Realloc(MemoryManager* mgr, void* ptr, tvm_index_t size) { vleak_size++; } #if TVM_CRT_DEBUG > 1 - printf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%" PRId64 - "\n", - data, start, mgr->ptable.max_pages, npage, vleak_size, size); + TVMLogf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%zu", + data, start, mgr->ptable.max_pages, npage, vleak_size, size); #endif // TVM_CRT_DEBUG return data; } @@ -251,8 +250,8 @@ void MemoryManager_Free(MemoryManager* mgr, void* ptr) { free_map->insert(free_map, p->num_pages, p); vleak_size--; #if TVM_CRT_DEBUG > 1 - printf("release: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", ptr, - entry->page.ptable_begin, mgr->ptable.max_pages, entry->page.num_pages, vleak_size); + TVMLogf("release: addr=%p, start=%" PRId64 "/%zu, npage=%zu, vleak=%d", ptr, + entry->page.ptable_begin, mgr->ptable.max_pages, entry->page.num_pages, vleak_size); #endif // TVM_CRT_DEBUG } diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index 03d81aa184f8..c5dc792cf315 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -765,10 +765,13 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { uint8_t did_find_linked_param = 0; if (lookup_linked_param_valid) { lookup_linked_param.args.values[0].v_int64 = idx; - if (lookup_linked_param.Call(&lookup_linked_param) == 0) { + CHECK_EQ(lookup_linked_param.Call(&lookup_linked_param), 0, "lookup_linked_param"); + + void* linked_param_data = lookup_linked_param.ret_value.values[0].v_handle; + if (linked_param_data != NULL) { runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1; DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor; - tensor->data = lookup_linked_param.ret_value.values[0].v_handle; + tensor->data = linked_param_data; tensor->ctx = ctx; tensor->ndim = attrs->ndim[pit.entry_id]; tensor->shape = attrs->shape + idx * TVM_CRT_MAX_NDIM; diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c index 98e4693a4fb6..a8de71e33f9d 100644 --- 
a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -88,8 +88,7 @@ int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, T uint32_t eid = TVMGraphRuntime_GetEntryId( graph_runtime.runtime, graph_runtime.runtime->input_nodes[index], 0); ret_values[0].v_handle = (void*) &graph_runtime.runtime->data_entry[eid].dl_tensor; - ret_tcodes[0] = kTVMOpaqueHandle; - //ret_tcodes[0] = kTVMNDArrayHandle; + ret_tcodes[0] = kTVMNDArrayHandle; return 0; } @@ -132,8 +131,7 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); ret_values[0].v_handle = (void*) &(graph_runtime.runtime->data_entry[eid].dl_tensor); -// ret_tcodes[0] = kTVMNDArrayHandle; - ret_tcodes[0] = kTVMOpaqueHandle; + ret_tcodes[0] = kTVMNDArrayHandle; return 0; } diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 7c34d9626181..423a2d62ea93 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -257,7 +257,7 @@ void GraphRuntime::PreAllocatedDLTensorDeleter(DLManagedTensor* tensor) { void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { Module mod = args[0]; int64_t storage_id = args[1]; - NDArray template_tensor = args[2]; + DLTensor* template_tensor = args[2]; TVMContext ctx = args[3]; // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. @@ -274,8 +274,12 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { return; } + std::vector shape_vec{ + template_tensor->shape, + template_tensor->shape + template_tensor->ndim}; + std::unique_ptr container{new NDArray::Container( - static_cast(opaque_handle), template_tensor.Shape(), template_tensor.DataType(), ctx)}; + static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; *rv = NDArray(GetObjectPtr(container.release())); } diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index f134db37c36d..630966892fd4 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -333,7 +333,6 @@ def _run_unlinked(lib_mod): def test_crt_link_params(): import tvm.micro - for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype) @@ -395,4 +394,4 @@ def _run_unlinked(lib): if __name__ == '__main__': - sys.exit(pytest.main(sys.argv[1:])) + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 22a587c1fdb07248e6b9fd87aed2ca4498336326 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 10:55:43 -0800 Subject: [PATCH 12/60] missed stuff --- include/tvm/tir/function.h | 38 ++++++++++++++++++++++ src/relay/backend/build_module.cc | 21 +++++++----- src/relay/backend/graph_runtime_codegen.cc | 31 ++++++++++++++---- src/target/llvm/codegen_llvm.cc | 4 +++ src/target/llvm/codegen_llvm.h | 12 +++++++ src/target/llvm/llvm_module.cc | 4 +++ src/target/target_kind.cc | 2 ++ src/tir/ir/function.cc | 7 ++++ 8 files changed, 105 insertions(+), 14 deletions(-) diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 64dbb5cf8ec3..ecc0e672749a 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -150,6 +151,32 @@ class PrimFunc : public BaseFunc { 
TVM_DEFINE_OBJECT_REF_COW_METHOD(PrimFuncNode); }; +class LinkedParamNode : public Object { + public: + /*! \brief Unique numeric identifier used by runtimes to lookup this parameter. */ + int64_t id; + + /*! \brief Parameter data which should get linked into the final module. */ + ::tvm::runtime::NDArray param; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("id", &id); + v->Visit("param", ¶m); + } + + static constexpr const char* _type_key = "tir.LinkedParam"; + TVM_DECLARE_FINAL_OBJECT_INFO(LinkedParamNode, Object); +}; + +class LinkedParam : public ObjectRef { + public: + LinkedParam(int64_t id, ::tvm::runtime::NDArray param); + + TVM_DEFINE_OBJECT_REF_METHODS(LinkedParam, ObjectRef, LinkedParamNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(LinkedParamNode); +}; + + /*! * \brief PrimFunc specific attribute names. * @@ -192,6 +219,17 @@ constexpr const char* kNoAlias = "tir.noalias"; * \note There can only be one entry function per module. */ constexpr const char* kIsEntryFunc = "tir.is_entry_func"; + +/*! + * \brief Parameters used in the module that should be linked by the codegen. + * + * Type: Map + * + * \note This should be present only on a function named + * tvm::target::packed_func::kLookupLinkedParam. + */ +constexpr const char* kLinkedParams = "tir.linked_params"; + } // namespace attr } // namespace tir } // namespace tvm diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index cc304808b16f..762d29b90933 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -22,6 +22,7 @@ * \brief Code generation for TVM's graph runtime. */ #include +#include #include #include #include @@ -30,6 +31,7 @@ #include +#include "../../target/func_registry_generator.h" #include "../../target/source/codegen_source_base.h" #include "compile_engine.h" #include "utils.h" @@ -88,6 +90,17 @@ struct GraphCodegen { return ret; } + std::unordered_map GetParamIds() { + std::unordered_map ret; + auto names = CallFunc>("list_params_name", nullptr); + for (const auto& expr : names) { + // Implicit cast from runtime::String to std::string + std::string key = expr; + ret[key] = CallFunc("get_param_id", key); + } + return ret; + } + protected: tvm::runtime::Module mod; template @@ -474,14 +487,6 @@ class RelayBuildModule : public runtime::ModuleNode { // When there is no lowered_funcs due to reasons such as optimization. if (lowered_funcs.size() == 0) { - Target target_host = GetTargetHost(); - - // If no target_host has been set, we choose a default one, which is - // llvm if "codegen.LLVMModuleCreate" is accessible. - const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) - target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); - if (target_host.defined() && target_host->kind->name == "llvm") { // If we can decide the target is LLVM, we then create an empty LLVM module. ret_.mod = (*pf)(target_host->str(), "empty_module"); diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index e24d18de931c..609327ca1071 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -56,7 +56,7 @@ struct LoweredOutput { std::string graph_json; Map lowered_funcs; Array external_mods; - std::unordered_map params; + std::unordered_map> params; }; /*! 
\brief Node types */ @@ -203,7 +203,11 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator>(); + for (auto param : params_) { + ret.params.emplace( + std::make_pair(param.first, std::make_pair(int(param_storage_ids_[param.first]), param.second))); + } for (auto& kv : lowered_funcs_) { if (ret.lowered_funcs.count(kv.first) == 0) { @@ -312,9 +316,12 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator(op); size_t index = params_.size(); std::string name = "p" + std::to_string(index); - params_[name] = op->data; auto node = GraphInputNode::make_node_ptr(name, GraphAttrs()); - return AddNode(node, expr); + auto to_return = AddNode(node, expr); + CHECK_EQ(to_return.size(), 1) << "Expected exactly 1 parameter node created"; + param_storage_ids_[name] = nodes_.size() - 1; + params_[name] = op->data; + return to_return; } std::vector VisitExpr_(const TupleNode* op) override { @@ -531,8 +538,14 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator> var_map_; /*! \brief target device */ TargetsMap targets_; - /*! \brief params */ + /*! + * \brief parameters (i.e. ConstantNodes found in the graph). + * These are take as inputs to the GraphRuntime. + * Maps param name to a pair of storage_id and NDArray. At runtime, the storage_id can be + * used to lookup the parameter. + */ std::unordered_map params_; + std::unordered_map param_storage_ids_; /*! \brief plan memory of device result */ Map> storage_device_map_; /*! \brief lowered funcs */ @@ -582,7 +595,13 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; ICHECK_GT(this->output_.params.count(key), 0); - *rv = this->output_.params[key]; + *rv = this->output_.params[key].second; + }); + } else if (name == "get_param_id") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + ICHECK_GT(this->output_.params.count(key), 0); + *rv = this->output_.params[key].first; }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 1b25a691ee4a..87a49cc5bd06 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -26,13 +26,17 @@ #include #include +#include #include #include +#include "llvm/Support/raw_os_ostream.h" #include "../../arith/pattern_match.h" #include "../build_common.h" +#include "../func_registry_generator.h" #include "codegen_cpu.h" +#include "codegen_params.h" namespace tvm { namespace codegen { diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 78eb5e2dcac7..71583708da2c 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -98,6 +98,18 @@ class CodeGenLLVM : public ExprFunctor, * \param mod The module to be linked. */ void AddLinkModule(std::unique_ptr&& mod); + /*! + * \brief Link parameters into the module so they don't need to be supplied at runtime. + * Parameters can be linked into the module so that the generated code is easier to use, or so + * that RAM space doesn't need to be allocated for them. This function adds the given parameters + * to the generated LLVM module. + * \param storage_id_offset Offset added to the index of each entry in params_by_sid to form the + * storage_id of that parameter. Storage ids for parameters are expected to be contiguous. + * \param params_by_sid Array of NDArray. 
Each entry is a parameter. The index of the array (added + * to sid_offset) is the storage_id of the param. + * \param param_names Array containing the name for each param in params_by_sid. + */ + void LinkParameters(const Map params); /*! * \brief Create Value for expression e * \param e The expression to be created value for. diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 89774ec61618..ceb609c1e666 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -32,6 +32,7 @@ #include "../../runtime/file_utils.h" #include "../../runtime/library_module.h" +#include "../func_registry_generator.h" #include "codegen_blob.h" #include "codegen_llvm.h" #include "llvm_common.h" @@ -199,6 +200,9 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::vector funcs; std::string entry_func; + Map linked_params; + bool found_linked_params = false; + bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { if (could_have_linked_params && kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 017ba396f861..f249ef8f529d 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -213,10 +213,12 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("mfloat-abi") .add_attr_option("system-lib") .add_attr_option("runtime") + .add_attr_option("link-params") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("system-lib") + .add_attr_option("link-params") .add_attr_option("runtime") .add_attr_option("mcpu") .set_default_keys({"cpu"}); diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc index ef7f4f8e16dd..101d80a52ea1 100644 --- a/src/tir/ir/function.cc +++ b/src/tir/ir/function.cc @@ -28,6 +28,13 @@ namespace tvm { namespace tir { +LinkedParam::LinkedParam(int64_t id, ::tvm::runtime::NDArray param) { + auto n = make_object(); + n->id = id; + n->param = param; + data_ = std::move(n); +} + // Get the function type of a PrimFunc PrimFunc::PrimFunc(Array params, Stmt body, Type ret_type, Map buffer_map, DictAttrs attrs, Span span) { From f7b15b70dbf7d5242f1833336763d308556705f6 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 13:51:45 -0800 Subject: [PATCH 13/60] git-clang-format --- include/tvm/tir/function.h | 3 +-- src/relay/backend/graph_runtime_codegen.cc | 4 ++-- src/target/llvm/codegen_llvm.cc | 4 ++-- src/target/llvm/llvm_module.cc | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index ecc0e672749a..a22552ea190c 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -25,10 +25,10 @@ #define TVM_TIR_FUNCTION_H_ #include +#include #include #include #include -#include #include @@ -176,7 +176,6 @@ class LinkedParam : public ObjectRef { TVM_DEFINE_OBJECT_REF_COW_METHOD(LinkedParamNode); }; - /*! * \brief PrimFunc specific attribute names. 
* diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index 609327ca1071..a5073326c13c 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -205,8 +205,8 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator>(); for (auto param : params_) { - ret.params.emplace( - std::make_pair(param.first, std::make_pair(int(param_storage_ids_[param.first]), param.second))); + ret.params.emplace(std::make_pair( + param.first, std::make_pair(int(param_storage_ids_[param.first]), param.second))); } for (auto& kv : lowered_funcs_) { diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 87a49cc5bd06..39ea82065377 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -25,18 +25,18 @@ #include "codegen_llvm.h" #include -#include #include +#include #include #include -#include "llvm/Support/raw_os_ostream.h" #include "../../arith/pattern_match.h" #include "../build_common.h" #include "../func_registry_generator.h" #include "codegen_cpu.h" #include "codegen_params.h" +#include "llvm/Support/raw_os_ostream.h" namespace tvm { namespace codegen { diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index ceb609c1e666..ab2fcee00b9e 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -200,7 +200,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::vector funcs; std::string entry_func; - Map linked_params; + Map linked_params; bool found_linked_params = false; bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { From ef6e14f1668c3716c02c24f3d8c3338e6a6c26b9 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 13:52:40 -0800 Subject: [PATCH 14/60] black format --- python/tvm/contrib/binutils.py | 2 +- python/tvm/micro/build.py | 9 +- python/tvm/micro/debugger.py | 3 +- python/tvm/micro/session.py | 17 +- python/tvm/relay/param_dict.py | 6 +- python/tvm/target/target.py | 2 +- tests/python/unittest/test_link_params.py | 236 +++++++++++----------- 7 files changed, 143 insertions(+), 132 deletions(-) diff --git a/python/tvm/contrib/binutils.py b/python/tvm/contrib/binutils.py index 146944970827..53f92b9855fe 100644 --- a/python/tvm/contrib/binutils.py +++ b/python/tvm/contrib/binutils.py @@ -38,7 +38,7 @@ def run_cmd(cmd): output : str resulting stdout capture from the subprocess """ - _LOG.debug('execute: %s', ' '.join(cmd)) + _LOG.debug("execute: %s", " ".join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (output, _) = proc.communicate() output = output.decode("utf-8") diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index bed5bde6f916..4aec9ea5ecbb 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -111,8 +111,13 @@ def default_options(target_include_dir): def build_static_runtime( - workspace, compiler, module, lib_opts=None, bin_opts=None, generated_lib_opts=None, - extra_libs=None + workspace, + compiler, + module, + lib_opts=None, + bin_opts=None, + generated_lib_opts=None, + extra_libs=None, ): """Build the on-device runtime, statically linking the given modules. 
diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index 18ed350305d8..9dd496a950e5 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -199,7 +199,8 @@ def popen_kwargs(self): raise NotImplementedError(f"System {sysname} is not yet supported") self.fd_transport = FdTransport( - stdout_read, stdin_write, timeouts=transport.debug_transport_timeouts()) + stdout_read, stdin_write, timeouts=transport.debug_transport_timeouts() + ) self.fd_transport.open() return { diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 5be7d59a143f..adbad20cda06 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -156,7 +156,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): try: - lookup_linked_param = mod.get_function('_lookup_linked_param') + lookup_linked_param = mod.get_function("_lookup_linked_param") except KeyError: return None @@ -164,8 +164,9 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): if remote_data is None: return None - return get_global_func('tvm.rpc.NDArrayFromRemoteOpaqueHandle')( - mod, remote_data, template_tensor, ctx, lambda: None) + return get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")( + mod, remote_data, template_tensor, ctx, lambda: None + ) def create_local_graph_runtime(graph_json_str, mod, ctx): @@ -189,8 +190,9 @@ def create_local_graph_runtime(graph_json_str, mod, ctx): """ device_type_id = [ctx.device_type, ctx.device_id] fcreate = get_global_func("tvm.graph_runtime.create") - return graph_runtime.GraphModule(fcreate(graph_json_str, mod, lookup_remote_linked_param, - *device_type_id)) + return graph_runtime.GraphModule( + fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id) + ) def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): @@ -219,4 +221,7 @@ def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): fcreate = get_global_func("tvm.graph_runtime_debug.create") return debug_runtime.GraphModuleDebug( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), - [ctx], graph_json_str, dump_root=dump_root) + [ctx], + graph_json_str, + dump_root=dump_root, + ) diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 463eae51d7b8..37b4f1c72c4a 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -83,9 +83,9 @@ def linkable_param_dict(graph_json, params, target): graph = json.loads(graph_json) data_by_sid = [None] * len(params) for param_name, param in params.items(): - for node in graph['nodes']: - if node['name'] == param_name: - sid = node['storage_id'] + for node in graph["nodes"]: + if node["name"] == param_name: + sid = node["storage_id"] data_by_sid[sid] = param # GraphRuntimeCodegen is expected to allocated the first len(params) storage_ids to contain diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 6ef41748ca5b..cd874b8bffe4 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -237,7 +237,7 @@ def micro(model="unknown", options=None): } opts = _merge_opts( trans_table[model] + ["-runtime=c", "--system-lib", "--link-params", f"-model={model}"], - options + options, ) # NOTE: in the future, the default micro target will be LLVM except when diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 630966892fd4..3dc3122af81c 100644 --- 
a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -36,15 +36,15 @@ # The data types that are linkable. LINKABLE_DTYPES = ( - [f'uint{b}' for b in (8, 16, 32, 64)] + - [f'int{b}' for b in (8, 16, 32, 64)] + - ['float32', 'float64']) - + [f"uint{b}" for b in (8, 16, 32, 64)] + + [f"int{b}" for b in (8, 16, 32, 64)] + + ["float32", "float64"] +) def dtype_info(dtype): """Lookup numpy type info for the given string dtype (of LINKABLE_DTYPES above).""" - if 'int' in dtype: + if "int" in dtype: return np.iinfo(getattr(np, dtype)) else: return np.finfo(getattr(np, dtype)) @@ -59,18 +59,18 @@ def _make_random_tensor(dtype): """Create a random test tensor of shape TEST_SHAPE and the given dtype.""" global RAND_SEED if RANDOM_TENSOR_START is not None: - to_return = np.arange(RANDOM_TENSOR_START, - RANDOM_TENSOR_START + np.prod(TEST_SHAPE), - dtype=dtype).reshape(TEST_SHAPE) - RAND_SEED += np.prod(TEST_SHAPE) - return to_return + to_return = np.arange( + RANDOM_TENSOR_START, RANDOM_TENSOR_START + np.prod(TEST_SHAPE), dtype=dtype + ).reshape(TEST_SHAPE) + RAND_SEED += np.prod(TEST_SHAPE) + return to_return dinfo = dtype_info(dtype) - if 'int' in dtype: + if "int" in dtype: return np.random.randint(dinfo.min, dinfo.max, TEST_SHAPE, dtype=dtype) else: to_return = np.random.uniform(0, dinfo.max, TEST_SHAPE) -# to_return = dinfo.min + (np.random.random(TEST_SHAPE) * dinfo.max) + # to_return = dinfo.min + (np.random.random(TEST_SHAPE) * dinfo.max) np.reshape(to_return, np.prod(TEST_SHAPE))[::2] *= -1 return to_return @@ -92,28 +92,28 @@ def _lookup_sid(graph, name): The storage_id of the parameter. """ num_outputs_seen = 0 - for i, n in enumerate(graph['nodes']): - if n['name'] == name: - return graph['attrs']['storage_id'][1][num_outputs_seen] + for i, n in enumerate(graph["nodes"]): + if n["name"] == name: + return graph["attrs"]["storage_id"][1][num_outputs_seen] else: - if 'attrs' in n and 'num_outputs' in n['attrs']: - num_outputs_seen += n['attrs']['num_outputs'] + if "attrs" in n and "num_outputs" in n["attrs"]: + num_outputs_seen += n["attrs"]["num_outputs"] else: num_outputs_seen += 1 - raise KeyError(f'no such param: {name}') + raise KeyError(f"no such param: {name}") def _get_ctypes_dtype(dt): """Return a ctypes c_* datatype given a string data type.""" - if 'int' in dt: - return getattr(ctypes, f'c_{dt}') - elif dt == 'float32': + if "int" in dt: + return getattr(ctypes, f"c_{dt}") + elif dt == "float32": return ctypes.c_float - elif dt == 'float64': + elif dt == "float64": return ctypes.c_double else: - assert False, f'unknown dtype: {dt}' + assert False, f"unknown dtype: {dt}" def _verify_linked_param(dtype, lib, mod, graph, name): @@ -122,13 +122,12 @@ def _verify_linked_param(dtype, lib, mod, graph, name): # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend), # a GraphRuntimeFactory module is created instead of the module itself. 
param_ptr = mod.get_function("_lookup_linked_param", True)(sid) - print('verify', param_ptr) + print("verify", param_ptr) arr_data = (_get_ctypes_dtype(dtype) * np.prod(TEST_SHAPE)).from_address(param_ptr.value) gen_param = lib.params[name] - print('gen param dtype', gen_param.dtype) - arr = np.ndarray( - shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order='C') - if 'int' in gen_param.dtype: + print("gen param dtype", gen_param.dtype) + arr = np.ndarray(shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order="C") + if "int" in gen_param.dtype: np.testing.assert_equal(gen_param.asnumpy(), arr) else: np.testing.assert_allclose(gen_param.asnumpy(), arr) @@ -140,30 +139,29 @@ def _make_mod_and_params(dtype): param_init = {} def _add_decl(name, dtype): - param_decls[name] = f'%{name} : Tensor[{TEST_SHAPE}, {dtype}]' + param_decls[name] = f"%{name} : Tensor[{TEST_SHAPE}, {dtype}]" param_init[name] = _make_random_tensor(dtype) - _add_decl(f'{dtype}_a', dtype) - _add_decl(f'{dtype}_b', dtype) + _add_decl(f"{dtype}_a", dtype) + _add_decl(f"{dtype}_b", dtype) mod_lines = [ '#[version = "0.0.5"]', f"def @main(%rand_input : Tensor[{TEST_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", ] - if 'int' in dtype: + if "int" in dtype: mod_lines.append( -# f' %0 = bitwise_xor(%rand_input, bitwise_xor(%{dtype}_a, %{dtype}_b));') - f' %0 = add(%rand_input, %{dtype}_a);') + # f' %0 = bitwise_xor(%rand_input, bitwise_xor(%{dtype}_a, %{dtype}_b));') + f" %0 = add(%rand_input, %{dtype}_a);" + ) else: mod_lines.append( - f' %0 = cast(add(%rand_input, cast(add(%{dtype}_a, %{dtype}_b), dtype="{dtype}")), dtype="{dtype}");') -# f' %0 = cast(add(%rand_input, %{dtype}_a), dtype="{dtype}");') - mod_lines.extend([ - ' %0', - '}' - ]) - - mod = tvm.parser.fromtext('\n'.join(mod_lines)) + f' %0 = cast(add(%rand_input, cast(add(%{dtype}_a, %{dtype}_b), dtype="{dtype}")), dtype="{dtype}");' + ) + # f' %0 = cast(add(%rand_input, %{dtype}_a), dtype="{dtype}");') + mod_lines.extend([" %0", "}"]) + + mod = tvm.parser.fromtext("\n".join(mod_lines)) return mod, param_init @@ -172,8 +170,8 @@ def test_llvm_link_params(): for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype) - main_func = mod['main'] - target = 'llvm --runtime=c --system-lib --link-params' + main_func = mod["main"] + target = "llvm --runtime=c --system-lib --link-params" with tvm.transform.PassContext(opt_level=3): lib = tvm.relay.build(mod, target, params=param_init) assert set(lib.params.keys()) == {"p0"} # NOTE: op folded @@ -186,60 +184,60 @@ def test_llvm_link_params(): def _run_linked(lib): graph_json, mod, _ = lib graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) - graph_rt.set_input('rand_input', rand_input) # NOTE: params not required. + graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. 
graph_rt.run() return graph_rt.get_output(0) linked_output = _run_linked(lib) with tvm.transform.PassContext(opt_level=3): - lib = tvm.relay.build(mod, 'llvm --system-lib', params=param_init) + lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init) def _run_unlinked(lib): graph_json, mod, lowered_params = lib graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) - graph_rt.set_input('rand_input', rand_input, **lowered_params) + graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0) unlinked_output = _run_unlinked(lib) - if 'int' in dtype: + if "int" in dtype: np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) else: np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) def _get_c_datatype(dtype): - """Translate LINKABLE_DTYPES element to c datatype.""" - if 'int' in dtype: - return f'{dtype}_t' - elif dtype == 'float32': - return 'float' - elif dtype == 'float64': - return 'double' - else: - assert False, f'unknown dtype {dtype}' + """Translate LINKABLE_DTYPES element to c datatype.""" + if "int" in dtype: + return f"{dtype}_t" + elif dtype == "float32": + return "float" + elif dtype == "float64": + return "double" + else: + assert False, f"unknown dtype {dtype}" def _format_c_value(dtype, width, x): - if 'int' in dtype: - hex_formatstr = f'{{:{"+" if dtype.startswith("int") else ""}#0{width}x}}' - return hex_formatstr.format(x) - elif 'float' in dtype: - to_ret = float(x).hex() - if 'inf' in to_ret: - return ('-' if x < 0 else '') + 'INFINITY' - elif 'nan' in to_ret: - return 'NAN' - - before, after = to_ret.split('p') - return f'{before.rstrip("0")}p{after}' - else: - assert False, f"don't know dtype {dtype}" + if "int" in dtype: + hex_formatstr = f'{{:{"+" if dtype.startswith("int") else ""}#0{width}x}}' + return hex_formatstr.format(x) + elif "float" in dtype: + to_ret = float(x).hex() + if "inf" in to_ret: + return ("-" if x < 0 else "") + "INFINITY" + elif "nan" in to_ret: + return "NAN" + + before, after = to_ret.split("p") + return f'{before.rstrip("0")}p{after}' + else: + assert False, f"don't know dtype {dtype}" -HEX_NUM_RE = re.compile(r'[+\-]?(?:(?:0x[0-9A-Fa-f.p+-]+)|(?:INFINITY)|(?:NAN))') +HEX_NUM_RE = re.compile(r"[+\-]?(?:(?:0x[0-9A-Fa-f.p+-]+)|(?:INFINITY)|(?:NAN))") def test_c_link_params(): @@ -247,58 +245,58 @@ def test_c_link_params(): for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype) - main_func = mod['main'] - target = 'c --link-params' + main_func = mod["main"] + target = "c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lib = tvm.relay.build(mod, target, params=param_init) assert set(lib.params.keys()) == {"p0"} # NOTE: op folded src = lib.lib.get_source() - lib.lib.save('test.c', 'cc') + lib.lib.save("test.c", "cc") c_dtype = _get_c_datatype(dtype) - src_lines = src.split('\n') - param = lib.params['p0'].asnumpy().reshape(np.prod(TEST_SHAPE)) - param_def = f'static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{' + src_lines = src.split("\n") + param = lib.params["p0"].asnumpy().reshape(np.prod(TEST_SHAPE)) + param_def = f"static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{" for i, line in enumerate(src_lines): - if line == param_def: - i += 1 - break + if line == param_def: + i += 1 + break else: - assert False, f'did not find parameter definition "{param_def}":\n{src}' + assert False, 
f'did not find parameter definition "{param_def}":\n{src}' cursor = 0 width = dtype_info(dtype).bits // 4 + 2 if dtype.startswith("int"): - width += 1 # Account for sign - - print('check printing of', param) - while '};' not in src_lines[i]: - for match in HEX_NUM_RE.finditer(src_lines[i]): - assert match.group() == _format_c_value(dtype, width, param[cursor]), ( - f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got ' - f'"{match.group(0)}"; full p0 follows:\n{src}') - cursor += 1 - i += 1 + width += 1 # Account for sign + + print("check printing of", param) + while "};" not in src_lines[i]: + for match in HEX_NUM_RE.finditer(src_lines[i]): + assert match.group() == _format_c_value(dtype, width, param[cursor]), ( + f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got ' + f'"{match.group(0)}"; full p0 follows:\n{src}' + ) + cursor += 1 + i += 1 assert cursor == np.prod(param.shape) temp = utils.tempdir() # Need a unique name per library to avoid dlopen caching the lib load. - lib_path = temp_dir.relpath(f'test-{dtype}-linked.so') - lib['remove_params']().export_library(lib_path) + lib_path = temp_dir.relpath(f"test-{dtype}-linked.so") + lib["remove_params"]().export_library(lib_path) lib_mod = tvm.runtime.load_module(lib_path) -# lib_mod = lib_factory['default']() + # lib_mod = lib_factory['default']() graph = json.loads(lib.graph_json) for p in lib.params: _verify_linked_param(dtype, lib, lib_mod, graph, p) # Wrap in function to explicitly deallocate the runtime. def _run_linked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule( - lib_mod['default'](tvm.cpu(0))) - graph_rt.set_input('rand_input', rand_input) # NOTE: params not required. - print('linked', graph_rt.get_input('p0')) + graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. + print("linked", graph_rt.get_input("p0")) graph_rt.run() return graph_rt.get_output(0) @@ -307,23 +305,24 @@ def _run_linked(lib_mod): linked_params = lib.params with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - lib = tvm.relay.build(mod, 'c', params=param_init) + lib = tvm.relay.build(mod, "c", params=param_init) _, _, params = lib # Need a unique name per library to avoid dlopen caching the lib load. 
- lib_path = temp_dir.relpath(f'test-{dtype}-unlinked.so') + lib_path = temp_dir.relpath(f"test-{dtype}-unlinked.so") lib.export_library(lib_path) lib_mod = tvm.runtime.load_module(lib_path) - print('unlinked', params) + print("unlinked", params) + def _run_unlinked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod['default'](tvm.cpu(0))) - graph_rt.set_input('rand_input', rand_input, **params) + graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt.set_input("rand_input", rand_input, **params) graph_rt.run() return graph_rt.get_output(0) unlinked_output = _run_unlinked(lib_mod) - if 'int' in dtype: + if "int" in dtype: np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) else: np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) @@ -336,8 +335,8 @@ def test_crt_link_params(): for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype) - main_func = mod['main'] - target = 'c -mcpu=native --system-lib --runtime=c --link-params' + main_func = mod["main"] + target = "c -mcpu=native --system-lib --runtime=c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): graph_json, lib, params = tvm.relay.build(mod, target, params=param_init) assert set(params.keys()) == {"p0"} # NOTE: op folded @@ -345,7 +344,7 @@ def test_crt_link_params(): workspace = tvm.micro.Workspace() compiler = tvm.micro.DefaultCompiler(target=target) opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) - opts['bin_opts']['ldflags'].append('-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE') + opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE") micro_binary = tvm.micro.build_static_runtime( # the x86 compiler *expects* you to give the exact same dictionary for both @@ -357,8 +356,10 @@ def test_crt_link_params(): lib, lib_opts=opts["bin_opts"], bin_opts=opts["bin_opts"], - extra_libs=[os.path.join(tvm.micro.CRT_ROOT_DIR, m) - for m in ('graph_runtime', 'graph_runtime_module')], + extra_libs=[ + os.path.join(tvm.micro.CRT_ROOT_DIR, m) + for m in ("graph_runtime", "graph_runtime_module") + ], ) flasher_kw = { @@ -367,31 +368,30 @@ def test_crt_link_params(): flasher = compiler.flasher(**flasher_kw) with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: rpc_lib = sess.get_system_lib() - graph_rt = tvm.contrib.graph_runtime.create( - graph_json, rpc_lib, sess.context) + graph_rt = tvm.contrib.graph_runtime.create(graph_json, rpc_lib, sess.context) # NOTE: not setting params here. 
- graph_rt.set_input('rand_input', rand_input) + graph_rt.set_input("rand_input", rand_input) graph_rt.run() linked_output = graph_rt.get_output(0).asnumpy() with tvm.transform.PassContext(opt_level=3): - lib = tvm.relay.build(mod, 'llvm --system-lib', params=param_init) + lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init) def _run_unlinked(lib): graph_json, mod, lowered_params = lib graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) - graph_rt.set_input('rand_input', rand_input, **lowered_params) + graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0) unlinked_output = _run_unlinked(lib).asnumpy() - if 'int' in dtype: + if "int" in dtype: np.testing.assert_equal(unlinked_output, linked_output) else: np.testing.assert_allclose(unlinked_output, linked_output) -if __name__ == '__main__': - sys.exit(pytest.main([__file__] + sys.argv[1:])) +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 6d6aa6687e25ce7f1dab66549bd67a4984cadc97 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 14:09:36 -0800 Subject: [PATCH 15/60] git-clang-format again --- include/tvm/runtime/crt/platform.h | 4 +- include/tvm/runtime/device_api.h | 10 +- src/relay/backend/build_module.cc | 12 +- src/runtime/crt/common/memory.c | 4 +- src/runtime/crt/graph_runtime/graph_runtime.c | 19 +- .../graph_runtime_module.c | 70 ++- src/runtime/crt/host/main.cc | 4 +- src/runtime/crt/utvm_rpc_server/rpc_server.cc | 3 +- src/runtime/graph/graph_runtime.cc | 18 +- src/runtime/graph/graph_runtime.h | 5 +- src/runtime/rpc/rpc_module.cc | 47 +- src/target/llvm/codegen_llvm.cc | 50 +- src/target/llvm/codegen_params.cc | 537 +++++++++--------- src/target/llvm/codegen_params.h | 10 +- src/target/llvm/llvm_module.cc | 9 +- src/target/source/codegen_c_host.cc | 22 +- tests/micro/qemu/zephyr-runtime/src/main.c | 4 +- 17 files changed, 428 insertions(+), 400 deletions(-) diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index 3eac45f64e4e..0f8c6ba7baf2 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -53,8 +53,8 @@ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t code); * \param args extra arguments to be formatted. * \return number of bytes written. */ -size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, - const char* fmt, va_list args); +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args); #ifdef __cplusplus } // extern "C" diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 40989e4057e0..a6f5624de084 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -243,9 +243,7 @@ inline const char* DeviceName(int type) { /*! * \brief Return true if a TVMContext is owned by an RPC session. */ -inline bool IsRPCSessionContext(TVMContext ctx) { - return (ctx.device_type / kRPCSessMask) > 0; -} +inline bool IsRPCSessionContext(TVMContext ctx) { return (ctx.device_type / kRPCSessMask) > 0; } /*! * \brief Return the RPCSessTable index of the RPC Session that owns this context. 
@@ -279,9 +277,9 @@ inline std::ostream& operator<<(std::ostream& os, DLContext ctx); */ inline TVMContext AddRPCSessionMask(TVMContext ctx, int session_table_index) { CHECK(!IsRPCSessionContext(ctx)) - << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx; - ctx.device_type = static_cast( - ctx.device_type | (kRPCSessMask * (session_table_index + 1))); + << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx; + ctx.device_type = + static_cast(ctx.device_type | (kRPCSessMask * (session_table_index + 1))); return ctx; } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 762d29b90933..189227bb15a1 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -460,29 +460,27 @@ class RelayBuildModule : public runtime::ModuleNode { // If no target_host has been set, we choose a default one, which is // llvm if "codegen.LLVMModuleCreate" is accessible. const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) - target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); if (target_host->GetAttr("link-params").value_or(Bool(false))) { CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; auto param_ids = graph_codegen_->GetParamIds(); auto link_params = Map(); for (auto param : ret_.params) { - link_params.Set( - param.first, tir::LinkedParam(param_ids[param.first], param.second)); + link_params.Set(param.first, tir::LinkedParam(param_ids[param.first], param.second)); } Map dict; dict.Set(tvm::tir::attr::kLinkedParams, link_params); dict.Set(tvm::attr::kGlobalSymbol, String(::tvm::runtime::symbol::tvm_lookup_linked_param)); DictAttrs attrs{dict}; - auto prim = tir::PrimFunc( - Array(), tir::SeqStmt(Array()), VoidType(), Map(), attrs); + auto prim = tir::PrimFunc(Array(), tir::SeqStmt(Array()), VoidType(), + Map(), attrs); if (lowered_funcs.find(target_host->str()) == lowered_funcs.end()) { lowered_funcs.Set(target_host->str(), IRModule(Map({}))); } lowered_funcs[target_host->str()]->Add( - GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), prim); + GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), prim); } // When there is no lowered_funcs due to reasons such as optimization. diff --git a/src/runtime/crt/common/memory.c b/src/runtime/crt/common/memory.c index 646ba46feecb..876c10efe3ea 100644 --- a/src/runtime/crt/common/memory.c +++ b/src/runtime/crt/common/memory.c @@ -229,8 +229,8 @@ void* MemoryManager_Realloc(MemoryManager* mgr, void* ptr, tvm_index_t size) { vleak_size++; } #if TVM_CRT_DEBUG > 1 - TVMLogf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%zu", - data, start, mgr->ptable.max_pages, npage, vleak_size, size); + TVMLogf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%zu", data, + start, mgr->ptable.max_pages, npage, vleak_size, size); #endif // TVM_CRT_DEBUG return data; } diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index c5dc792cf315..450272d8722b 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -544,9 +544,7 @@ uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint * \param runtime The graph runtime. * \return the number of input tensors allocated. 
*/ -int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { - return runtime->input_nodes_count; -} +int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { return runtime->input_nodes_count; } /*! * \brief Get the input index given the name of input. @@ -689,9 +687,7 @@ void TVMGraphRuntime_Run(TVMGraphRuntime* runtime) { * \param runtime The graph runtime. * \return the number of output tensors allocated. */ -int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { - return runtime->outputs_count; -} +int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { return runtime->outputs_count; } int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out) { int status = 0; @@ -721,7 +717,8 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { temp_args.tcodes[0] = kTVMArgInt; temp_args.values_count = 1; lookup_linked_param_valid = - (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, "_lookup_linked_param", &temp_args) == 0); + (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, + "_lookup_linked_param", &temp_args) == 0); } // Grab saved optimization plan from graph. @@ -757,9 +754,8 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { // Allocate the space. for (idx = 0; idx < pool_entry_count; idx++) { - runtime->storage_pool = - vrealloc(runtime->storage_pool, - sizeof(TVMGraphRuntimeStorageEntry) * (runtime->storage_pool_count + 1)); + runtime->storage_pool = vrealloc(runtime->storage_pool, sizeof(TVMGraphRuntimeStorageEntry) * + (runtime->storage_pool_count + 1)); TVMGraphRuntimePoolEntry pit = pool_entry[idx]; TVMContext ctx = runtime->ctxs[0]; uint8_t did_find_linked_param = 0; @@ -787,7 +783,8 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { DLDataType dtype = {kDLFloat, 32, 1}; shape[0] = (pit.size + 3) / 4; runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 0; - runtime->storage_pool[runtime->storage_pool_count].array = TVMNDArray_Empty(1, shape, dtype, ctx); + runtime->storage_pool[runtime->storage_pool_count].array = + TVMNDArray_Empty(1, shape, dtype, ctx); CHECK_NE(runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor.data, 0, "fail to create storage_pool with idx=%d\n", idx); } diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c index a8de71e33f9d..3e73efcc62ab 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -24,9 +24,9 @@ * \brief wrap graph_runtime into a TVMModule for use with RPC. 
*/ -#include #include #include +#include #include #include "tvm/runtime/crt/internal/graph_runtime/graph_runtime.h" @@ -38,7 +38,8 @@ typedef struct { static GraphRuntimeModule graph_runtime; -int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (graph_runtime.runtime != NULL) { return kTvmErrorGraphModuleAlreadyCreated; } @@ -47,7 +48,8 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorFunctionCallNumArguments; } - if (tcodes[0] != kTVMStr || tcodes[1] != kTVMModuleHandle || tcodes[2] != kTVMArgInt || tcodes[3] != kTVMArgInt) { + if (tcodes[0] != kTVMStr || tcodes[1] != kTVMModuleHandle || tcodes[2] != kTVMArgInt || + tcodes[3] != kTVMArgInt) { return kTvmErrorFunctionCallWrongArgType; } @@ -55,7 +57,7 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorGraphModuleBadContext; } - TVMContext ctx = {(DLDeviceType) args[2].v_int64, (int) args[3].v_int64}; + TVMContext ctx = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; graph_runtime.runtime = TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &ctx); TVMModuleHandle out; @@ -71,7 +73,8 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorNoError; } -int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -85,14 +88,16 @@ int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorGraphModuleNoSuchInput; } - uint32_t eid = TVMGraphRuntime_GetEntryId( - graph_runtime.runtime, graph_runtime.runtime->input_nodes[index], 0); - ret_values[0].v_handle = (void*) &graph_runtime.runtime->data_entry[eid].dl_tensor; + uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, + graph_runtime.runtime->input_nodes[index], 0); + ret_values[0].v_handle = (void*)&graph_runtime.runtime->data_entry[eid].dl_tensor; ret_tcodes[0] = kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } @@ -102,7 +107,9 @@ int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int narg return 0; } -int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } @@ -112,7 +119,9 @@ int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nar return 0; } -int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { 
+int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -130,12 +139,14 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, uint32_t index = graph_runtime.runtime->outputs[output_index].index; uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); - ret_values[0].v_handle = (void*) &(graph_runtime.runtime->data_entry[eid].dl_tensor); + ret_values[0].v_handle = (void*)&(graph_runtime.runtime->data_entry[eid].dl_tensor); ret_tcodes[0] = kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -146,11 +157,12 @@ int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, ret_tcodes[0] = kTVMNullptr; - TVMByteArray* arr = (TVMByteArray*) args[0].v_handle; + TVMByteArray* arr = (TVMByteArray*)args[0].v_handle; return TVMGraphRuntime_LoadParams(graph_runtime.runtime, arr->data, arr->size); } -int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } @@ -161,7 +173,8 @@ int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMVal return 0; } -int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (nargs != 2) { return kTvmErrorFunctionCallNumArguments; } @@ -170,28 +183,25 @@ int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorFunctionCallWrongArgType; } - TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*) args[1].v_handle); + TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*)args[1].v_handle); ret_tcodes[0] = kTVMNullptr; return 0; } -int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { return kTvmErrorFunctionCallNotImplemented; } static const TVMBackendPackedCFunc graph_runtime_registry_funcs[] = { - &TVMGraphRuntimeModule_GetInput, - &TVMGraphRuntimeModule_GetNumInputs, - &TVMGraphRuntimeModule_GetNumOutputs, - &TVMGraphRuntimeModule_GetOutput, - &TVMGraphRuntimeModule_LoadParams, - &TVMGraphRuntimeModule_Run, - &TVMGraphRuntimeModule_SetInput, - &TVMGraphRuntimeModule_NotImplemented, + &TVMGraphRuntimeModule_GetInput, &TVMGraphRuntimeModule_GetNumInputs, + &TVMGraphRuntimeModule_GetNumOutputs, &TVMGraphRuntimeModule_GetOutput, + &TVMGraphRuntimeModule_LoadParams, &TVMGraphRuntimeModule_Run, + &TVMGraphRuntimeModule_SetInput, &TVMGraphRuntimeModule_NotImplemented, }; - 
static const TVMFuncRegistry graph_runtime_registry = { "\x08get_input\0" "get_num_inputs\0" @@ -204,8 +214,8 @@ static const TVMFuncRegistry graph_runtime_registry = { graph_runtime_registry_funcs}; tvm_crt_error_t TVMGraphRuntimeModule_Register() { - graph_runtime.mod.registry = &graph_runtime_registry; - graph_runtime.runtime = NULL; + graph_runtime.mod.registry = &graph_runtime_registry; + graph_runtime.runtime = NULL; - return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); + return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); } diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 8705ca899103..41f2dc3b0a1b 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -47,8 +47,8 @@ ssize_t UTvmWriteFunc(void* context, const uint8_t* data, size_t num_bytes) { return to_return; } -size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, - const char* fmt, va_list args) { +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { return vsnprintf(out_buf, out_buf_size_bytes, fmt, args); } diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 84930866367e..6674d5993cc6 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -219,8 +219,7 @@ void TVMLogf(const char* format, ...) { va_list args; char log_buffer[256]; va_start(args, format); - size_t num_bytes_logged = TVMPlatformFormatMessage( - log_buffer, sizeof(log_buffer), format, args); + size_t num_bytes_logged = TVMPlatformFormatMessage(log_buffer, sizeof(log_buffer), format, args); va_end(args); // Most header-based logging frameworks tend to insert '\n' at the end of the log message. diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 423a2d62ea93..e0401134cccc 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -261,8 +261,8 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { TVMContext ctx = args[3]; // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. - tvm::runtime::PackedFunc pf = mod.GetFunction( - ::tvm::runtime::symbol::tvm_lookup_linked_param, true); + tvm::runtime::PackedFunc pf = + mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); if (pf == nullptr) { *rv = nullptr; return; @@ -274,9 +274,8 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { return; } - std::vector shape_vec{ - template_tensor->shape, - template_tensor->shape + template_tensor->ndim}; + std::vector shape_vec{template_tensor->shape, + template_tensor->shape + template_tensor->ndim}; std::unique_ptr container{new NDArray::Container( static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; @@ -297,7 +296,6 @@ std::string List2String(std::vector shape) { return ss.str(); } - void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. 
std::vector vtype; @@ -335,10 +333,10 @@ void GraphRuntime::SetupStorage() { TVMRetValue lookup_rv; { std::vector shape_vec{attrs_.shape[i].begin(), attrs_.shape[i].end()}; - DLTensor template_tensor{ - nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), vtype[i], shape_vec.data(), nullptr, 0}; - lookup_rv = lookup_linked_param_( - module_, sid, &template_tensor, ctxs_[0]); + DLTensor template_tensor{nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), + vtype[i], shape_vec.data(), nullptr, + 0}; + lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, ctxs_[0]); } if (lookup_rv.type_code() != kTVMNullptr) { pool_entry[sid].linked_param = lookup_rv; diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index 9e95dfc9bf96..f1894c4830d0 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -187,8 +187,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { int device_type; int param_data_entry; NDArray linked_param; -// PoolEntry(int s, int dev_type, void* pre_linked_param) : -// size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} + // PoolEntry(int s, int dev_type, void* pre_linked_param) : + // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; // Node entry struct NodeEntry { @@ -420,7 +420,6 @@ class TVM_DLL GraphRuntime : public ModuleNode { std::vector> op_execs_; /*! \brief Linked parameter lookup function. */ PackedFunc lookup_linked_param_; - }; std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 0ddd13572949..c34ec26fb6c4 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -37,7 +37,6 @@ namespace tvm { namespace runtime { - // deleter of RPC remote array static void RemoteNDArrayDeleter(Object* obj) { auto* ptr = static_cast(obj); @@ -59,22 +58,23 @@ static void RemoteNDArrayDeleter(Object* obj) { * needs to be explicitly deleted after the NDArray is freed, this function should do that. * \param deleter_ctx An opaque pointer passed to deleter to identify the tensor being deleted. */ -NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, DLTensor* template_tensor, TVMContext ctx, ADTObj::FDeleter deleter, void* deleter_ctx) { +NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, + DLTensor* template_tensor, TVMContext ctx, + ADTObj::FDeleter deleter, void* deleter_ctx) { ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) - << "The TVMContext given does not belong to the given session"; + << "The TVMContext given does not belong to the given session"; RemoteSpace* space = new RemoteSpace(); space->sess = sess; space->data = handle; std::vector shape_vec{template_tensor->shape, template_tensor->shape + template_tensor->ndim}; - NDArray::Container* data = new NDArray::Container( - static_cast(space), std::move(shape_vec), template_tensor->dtype, ctx); + NDArray::Container* data = new NDArray::Container(static_cast(space), std::move(shape_vec), + template_tensor->dtype, ctx); data->manager_ctx = deleter_ctx; data->SetDeleter(deleter); return NDArray(GetObjectPtr(data)); } - /*! * \brief A wrapped remote function as a PackedFunc. 
*/ @@ -284,7 +284,9 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons ICHECK_EQ(args.size(), 3); DLTensor* tensor = args[1]; void* nd_handle = args[2]; - *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, AddRPCSessionMask(tensor->ctx, sess_->table_index()), RemoteNDArrayDeleter, nd_handle); + *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, + AddRPCSessionMask(tensor->ctx, sess_->table_index()), + RemoteNDArrayDeleter, nd_handle); } else { ICHECK_EQ(args.size(), 2); *rv = args[1]; @@ -470,20 +472,23 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* *rv = static_cast(m.operator->())->sess()->table_index(); }); -TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle").set_body_typed( - [](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, PackedFunc deleter) -> NDArray { -// auto func = new std::function([deleter]() -> void { -// deleter(); -// }); - return NDArrayFromRemoteOpaqueHandle( - RPCModuleGetSession(mod), remote_array, template_tensor, ctx, - [](Object* context) { -// auto container = static_cast(context); -// auto cb_func = reinterpret_cast*>(container->manager_ctx); -// (*cb_func)(); -// delete cb_func; - }, nullptr);//(void*) func); - }); +TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") + .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, + PackedFunc deleter) -> NDArray { + // auto func = new std::function([deleter]() -> void { + // deleter(); + // }); + return NDArrayFromRemoteOpaqueHandle( + RPCModuleGetSession(mod), remote_array, template_tensor, ctx, + [](Object* context) { + // auto container = static_cast(context); + // auto cb_func = + // reinterpret_cast*>(container->manager_ctx); + // (*cb_func)(); + // delete cb_func; + }, + nullptr); //(void*) func); + }); } // namespace runtime } // namespace tvm diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 39ea82065377..611ce47c6126 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -210,9 +210,9 @@ void CodeGenLLVM::LinkParameters(const Map params) { // Once we allow more flexibility in the PrimFunc. 
llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, param_types, false); - llvm::Function* function = llvm::Function::Create( - ftype, llvm::Function::ExternalLinkage, - ::tvm::runtime::symbol::tvm_lookup_linked_param, module_.get()); + llvm::Function* function = + llvm::Function::Create(ftype, llvm::Function::ExternalLinkage, + ::tvm::runtime::symbol::tvm_lookup_linked_param, module_.get()); function->setCallingConv(llvm::CallingConv::C); function->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); @@ -220,24 +220,24 @@ void CodeGenLLVM::LinkParameters(const Map params) { builder_->SetInsertPoint(entry); std::vector zero_index_list{llvm::ConstantInt::get(t_int32_, 0)}; auto args_array = builder_->CreateBitCast( - &function->arg_begin()[0], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); - llvm::Value* sid = - builder_->CreateBitCast( + &function->arg_begin()[0], + llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); + llvm::Value* sid = builder_->CreateBitCast( builder_->CreateLoad(t_void_->getPointerTo(GetGlobalAddressSpace()), - builder_->CreateInBoundsGEP(args_array, zero_index_list)), t_int64_); - // -// builder_->CreateGEP(&function->arg_begin()[0], zero_index_list), t_int64_); + builder_->CreateInBoundsGEP(args_array, zero_index_list)), + t_int64_); + // + // builder_->CreateGEP(&function->arg_begin()[0], zero_index_list), t_int64_); llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function); llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); builder_->SetInsertPoint(default_block); { - auto ret_types_array = builder_->CreateBitCast( - &function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); - builder_->CreateStore( - llvm::ConstantInt::get(t_int_, kTVMNullptr), - builder_->CreateGEP(ret_types_array, zero_index_list)); + auto ret_types_array = + builder_->CreateBitCast(&function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr), + builder_->CreateGEP(ret_types_array, zero_index_list)); builder_->CreateRet(ConstInt32(kTvmErrorNoError)); } @@ -249,24 +249,22 @@ void CodeGenLLVM::LinkParameters(const Map params) { array->print(os); std::string symbol_name = std::string{::tvm::runtime::symbol::tvm_param_prefix} + kv.first; llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable( - *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, - array, symbol_name); + *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, array, symbol_name); llvm::BasicBlock* case_block = llvm::BasicBlock::Create(*ctx_, "case_" + symbol_name, function); switch_inst->addCase( - llvm::cast(llvm::ConstantInt::get(t_int64_, kv.second->id)), - case_block); + llvm::cast(llvm::ConstantInt::get(t_int64_, kv.second->id)), case_block); builder_->SetInsertPoint(case_block); auto retval_array = builder_->CreateBitCast( - &function->arg_begin()[3], llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); - builder_->CreateStore( - builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), - builder_->CreateGEP(retval_array, zero_index_list)); - auto ret_types_array = builder_->CreateBitCast( - &function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + &function->arg_begin()[3], + llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); builder_->CreateStore( - 
llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), - builder_->CreateGEP(ret_types_array, zero_index_list)); + builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), + builder_->CreateGEP(retval_array, zero_index_list)); + auto ret_types_array = + builder_->CreateBitCast(&function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), + builder_->CreateGEP(ret_types_array, zero_index_list)); builder_->CreateRet(ConstInt32(0)); } diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 9c0b979044f4..688daf6a7191 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -31,17 +31,15 @@ namespace codegen { class DLManagedTensorDeleter { public: - void operator()(DLManagedTensor* ptr) { - ptr->deleter(ptr); - } + void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } }; llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { llvm::Type* element_type = nullptr; auto arr_type = arr.DataType(); - CHECK_EQ(arr_type.lanes(), 1) - << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); + CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " + << arr_type.lanes(); auto shape = arr.Shape(); int num_elements = 1; @@ -53,128 +51,121 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: std::vector elements; switch (arr_type.code()) { - case runtime::DataType::kInt: - CHECK(arr_type.bits() == 8 || - arr_type.bits() == 16 || - arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); - - if (arr_type.bits() == 8) { - int8_t* data_buf = static_cast(tensor->dl_tensor.data); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::getSigned(element_type, data_buf[i])); - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int16_t*) tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int32_t*) tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int64_t*) tensor->dl_tensor.data)[i])); - } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kUInt: - CHECK(arr_type.bits() == 8 || - arr_type.bits() == 16 || - arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); - - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int8_t*) tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int16_t*) tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < 
num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int32_t*) tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int64_t*) tensor->dl_tensor.data)[i])); + case runtime::DataType::kInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + + if (arr_type.bits() == 8) { + int8_t* data_buf = static_cast(tensor->dl_tensor.data); + for (int i = 0; i < num_elements; i++) { + elements.emplace_back(llvm::ConstantInt::getSigned(element_type, data_buf[i])); + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int16_t*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int32_t*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::getSigned(element_type, ((int64_t*)tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "should not get here"; } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kFloat: - if (arr_type.bits() == 32) { - element_type = llvm::Type::getFloatTy(*ctx); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantFP::get(element_type, ((float*) tensor->dl_tensor.data)[i])); + break; + + case runtime::DataType::TypeCode::kUInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int8_t*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int16_t*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int32_t*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantInt::get(element_type, ((int64_t*)tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "should not get here"; } - } else if (arr_type.bits() == 64) { - element_type = llvm::Type::getDoubleTy(*ctx); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantFP::get(element_type, ((double*) tensor->dl_tensor.data)[i])); + break; + + case runtime::DataType::TypeCode::kFloat: + if (arr_type.bits() == 32) { + element_type = llvm::Type::getFloatTy(*ctx); + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantFP::get(element_type, ((float*)tensor->dl_tensor.data)[i])); + } + } else if (arr_type.bits() == 
64) { + element_type = llvm::Type::getDoubleTy(*ctx); + for (int i = 0; i < num_elements; i++) { + elements.emplace_back( + llvm::ConstantFP::get(element_type, ((double*)tensor->dl_tensor.data)[i])); + } + } else { + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; } - } else { - CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " - << arr_type.bits() << "-bit array"; - } - break; + break; - default: - CHECK(false) << "Data type not supported"; + default: + CHECK(false) << "Data type not supported"; } - return llvm::cast( - llvm::ConstantArray::get(llvm::ArrayType::get(element_type, num_elements), - llvm::ArrayRef(elements))); + return llvm::cast(llvm::ConstantArray::get( + llvm::ArrayType::get(element_type, num_elements), llvm::ArrayRef(elements))); } - static constexpr const char* kFloatCast = "(float)"; static constexpr const char* kDoubleCast = "(double)"; static constexpr const int kMaxLineLength = 80; - void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { auto arr_type = arr.DataType(); - CHECK_EQ(arr_type.lanes(), 1) - << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); + CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " + << arr_type.lanes(); int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); if (arr_type.code() == runtime::DataType::TypeCode::kInt) { - one_element_size_bytes += 1; // sign bit + one_element_size_bytes += 1; // sign bit if (arr_type.bits() > 32) { one_element_size_bytes += 2; // "UL" } } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { if (arr_type.bits() > 32) { - one_element_size_bytes += 1; // "L" + one_element_size_bytes += 1; // "L" } } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { // Floats and doubles are printed as hex but casted. - one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + - 1 /* exponent sign */ + 1 /* extra decimal digit in exponent */; + one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */ + + 1 /* extra decimal digit in exponent */; } int elements_per_row = 16; @@ -198,171 +189,207 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& std::ios::adjustfield | std::ios::basefield | std::ios::showbase); os.fill('0'); switch (arr_type.code()) { - case runtime::DataType::kInt: - CHECK(arr_type.bits() == 8 || - arr_type.bits() == 16 || - arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid printing - // as a char. 
- int8_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; + case runtime::DataType::kInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid + // printing as a char. + int8_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint16_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(2) << +static_cast(to_print); + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - os << "0x" << std::setw(2) << +static_cast(to_print); - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - int16_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + int16_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint16_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(4) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - os << "0x" << std::setw(4) << to_print; - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - int32_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint32_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + int32_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint32_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(8) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - os << "0x" << std::setw(8) << to_print; - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - int64_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint64_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + int64_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint64_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(16) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - os << "0x" << std::setw(16) << to_print; - 
if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kUInt: - CHECK(arr_type.bits() == 8 || - arr_type.bits() == 16 || - arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid printing - // as a char. - os << "0x" << std::setw(2) - << +static_cast(static_cast(tensor->dl_tensor.data)[i]); - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } else { + CHECK(false) << "should not get here"; } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } - } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kFloat: { - std::stringstream ss; - ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, - std::ios::basefield | std::ios::showbase | std::ios::floatfield); - os.fill(' '); - os.setf(std::ios::left, std::ios::adjustfield); - if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - float elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. - os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); + break; + + case runtime::DataType::TypeCode::kUInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid + // printing as a char. 
+ os << "0x" << std::setw(2) + << +static_cast(static_cast(tensor->dl_tensor.data)[i]); + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else { + CHECK(false) << "should not get here"; } - std::cout << "\n"; - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - double elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. - os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); + break; + + case runtime::DataType::TypeCode::kFloat: { + std::stringstream ss; + ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + os.fill(' '); + os.setf(std::ios::left, std::ios::adjustfield); + if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + float elem = static_cast(tensor->dl_tensor.data)[i]; + if (std::isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (std::isnan(elem)) { + // GNU extension, implemenatation-dependent. + os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } } - if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { os << "\n" << indent_str; } + std::cout << "\n"; + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + double elem = static_cast(tensor->dl_tensor.data)[i]; + if (std::isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (std::isnan(elem)) { + // GNU extension, implemenatation-dependent. 
+ os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else { + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; } - } else { - CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " - << arr_type.bits() << "-bit array"; + break; } - break; - } - default: - CHECK(false) << "Data type not supported"; + default: + CHECK(false) << "Data type not supported"; } if (num_elements % elements_per_row != 0) { diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h index 8b8ba4f23cc6..6e3c3e8eafd6 100644 --- a/src/target/llvm/codegen_params.h +++ b/src/target/llvm/codegen_params.h @@ -24,10 +24,11 @@ #ifndef TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ #define TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ -#include "llvm_common.h" #include #include +#include "llvm_common.h" + namespace tvm { namespace codegen { @@ -35,14 +36,11 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); -void LLVMCodeGenParams(llvm::LLVMContext* ctx, - llvm::Module* module, - int64_t storage_id_offset, +void LLVMCodeGenParams(llvm::LLVMContext* ctx, llvm::Module* module, int64_t storage_id_offset, ::tvm::runtime::Array param_names, ::tvm::runtime::Array params_by_sid); - } // namespace codegen } // namespace tvm -#endif // TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ +#endif // TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index ab2fcee00b9e..73a3594427d3 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -206,11 +206,12 @@ class LLVMModuleNode final : public runtime::ModuleNode { for (auto kv : mod->functions) { if (could_have_linked_params && kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { - Map attrs_dict = Downcast>(kv.second->attrs->dict); + Map attrs_dict = + Downcast>(kv.second->attrs->dict); CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) - << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; - linked_params = Downcast>( - attrs_dict[::tvm::tir::attr::kLinkedParams]); + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = + Downcast>(attrs_dict[::tvm::tir::attr::kLinkedParams]); found_linked_params = true; continue; } diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 915d43cffb13..f47e07e94bd1 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -23,8 +23,8 @@ #include "codegen_c_host.h" #include -#include #include +#include #include #include @@ -67,8 +67,7 @@ void CodeGenCHost::LinkParameters(Map params) { << "int* out_ret_tcode, void* resource_handle) {\n"; ICHECK_EQ(GetUniqueName(tvm::runtime::symbol::tvm_lookup_linked_param), tvm::runtime::symbol::tvm_lookup_linked_param) - << "builtin PackedFunc name already taken: " - << tvm::runtime::symbol::tvm_lookup_linked_param; + << "builtin PackedFunc name already taken: " << tvm::runtime::symbol::tvm_lookup_linked_param; stream << " switch (((int64_t*) args)[0]) {\n" << " default:\n" << " out_ret_tcode[0] = " << kTVMNullptr << ";\n" @@ -86,15 +85,16 @@ void 
CodeGenCHost::LinkParameters(Map params) { num_elements *= dim; } PrintType(kv.second->param.DataType(), decl_stream); - decl_stream << " " << ::tvm::runtime::symbol::tvm_param_prefix - << kv.first << "[" << num_elements << "] = {\n"; + decl_stream << " " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << "[" + << num_elements << "] = {\n"; NDArrayDataToC(kv.second->param, 4, decl_stream); decl_stream << "};\n" << "#ifdef __cplusplus\n" << "} // extern \"C\"\n" << "#endif\n"; stream << " case " << kv.second->id << ":\n" - << " ((uint64_t*)out_ret_value)[0] = (uint64_t) (uintptr_t) " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" + << " ((uint64_t*)out_ret_value)[0] = (uint64_t) (uintptr_t) " + << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" << " out_ret_tcode[0] = " << kTVMOpaqueHandle << ";\n" << " return 0;\n"; } @@ -352,17 +352,17 @@ runtime::Module BuildCHost(IRModule mod, Target target) { CodeGenCHost cg; cg.Init(output_ssa, emit_asserts, target->str()); - Map linked_params; + Map linked_params; bool found_linked_params = false; bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { if (could_have_linked_params && kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { - Map attrs_dict = Downcast>(kv.second->attrs->dict); + Map attrs_dict = Downcast>(kv.second->attrs->dict); CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) - << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; - linked_params = Downcast>( - attrs_dict[::tvm::tir::attr::kLinkedParams]); + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = + Downcast>(attrs_dict[::tvm::tir::attr::kLinkedParams]); found_linked_params = true; continue; } diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c index 91b13de7d04d..1fa32e384c0b 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -57,8 +57,8 @@ ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) { return size; } -size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, - const char* fmt, va_list args) { +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); } From 261eda746032407416bd6a745821e3a92791eb1d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 15:13:29 -0800 Subject: [PATCH 16/60] address c++ lint --- src/relay/backend/graph_runtime_codegen.cc | 2 +- src/target/llvm/codegen_llvm.cc | 2 +- src/target/llvm/codegen_params.cc | 21 ++++++++++++--------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index a5073326c13c..b0b9d206a1a2 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -206,7 +206,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator>(); for (auto param : params_) { ret.params.emplace(std::make_pair( - param.first, std::make_pair(int(param_storage_ids_[param.first]), param.second))); + param.first, std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); } for (auto& kv : lowered_funcs_) { diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 
611ce47c6126..dbc41a6e1a54 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -247,7 +247,7 @@ void CodeGenLLVM::LinkParameters(const Map params) { auto array = NDArrayToLLVMArray(ctx_, kv.second->param); std::cout << "param " << kv.first << ": "; array->print(os); - std::string symbol_name = std::string{::tvm::runtime::symbol::tvm_param_prefix} + kv.first; + std::string symbol_name = std::string(::tvm::runtime::symbol::tvm_param_prefix) + kv.first; llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable( *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, array, symbol_name); diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 688daf6a7191..958d4db0ac29 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -25,6 +25,9 @@ #include "codegen_params.h" #include +#include +#include +#include namespace tvm { namespace codegen { @@ -66,17 +69,17 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: } else if (arr_type.bits() == 16) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int16_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int32_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, ((int64_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "should not get here"; @@ -93,22 +96,22 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: if (arr_type.bits() == 8) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int8_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 16) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int16_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int32_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantInt::get(element_type, ((int64_t*)tensor->dl_tensor.data)[i])); + llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "should not get here"; @@ -120,13 +123,13 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: element_type = llvm::Type::getFloatTy(*ctx); for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantFP::get(element_type, ((float*)tensor->dl_tensor.data)[i])); + llvm::ConstantFP::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if 
(arr_type.bits() == 64) { element_type = llvm::Type::getDoubleTy(*ctx); for (int i = 0; i < num_elements; i++) { elements.emplace_back( - llvm::ConstantFP::get(element_type, ((double*)tensor->dl_tensor.data)[i])); + llvm::ConstantFP::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " From c0d2c0d3d572d7ee6d754c8c432f07ce4d15d78b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 15:13:43 -0800 Subject: [PATCH 17/60] git-clang-format --- src/relay/backend/graph_runtime_codegen.cc | 3 +- src/target/llvm/codegen_params.cc | 36 +++++++++++----------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index b0b9d206a1a2..f35f144181c6 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -206,7 +206,8 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator>(); for (auto param : params_) { ret.params.emplace(std::make_pair( - param.first, std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); + param.first, + std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); } for (auto& kv : lowered_funcs_) { diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 958d4db0ac29..a7d79c466391 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -68,18 +68,18 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: } } else if (arr_type.bits() == 16) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::getSigned( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::getSigned( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::getSigned(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::getSigned( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "should not get here"; @@ -95,23 +95,23 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: if (arr_type.bits() == 8) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 16) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 32) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + 
elements.emplace_back(llvm::ConstantInt::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 64) { for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantInt::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantInt::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "should not get here"; @@ -122,14 +122,14 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: if (arr_type.bits() == 32) { element_type = llvm::Type::getFloatTy(*ctx); for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantFP::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantFP::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else if (arr_type.bits() == 64) { element_type = llvm::Type::getDoubleTy(*ctx); for (int i = 0; i < num_elements; i++) { - elements.emplace_back( - llvm::ConstantFP::get(element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); + elements.emplace_back(llvm::ConstantFP::get( + element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); } } else { CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " From 601616a0d69ac4ec5d23ce2111ddb1c3438f5c68 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 15:19:44 -0800 Subject: [PATCH 18/60] rm extra comments --- src/runtime/rpc/rpc_module.cc | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index c34ec26fb6c4..f8f95c5b7e28 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -475,19 +475,9 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, PackedFunc deleter) -> NDArray { - // auto func = new std::function([deleter]() -> void { - // deleter(); - // }); return NDArrayFromRemoteOpaqueHandle( RPCModuleGetSession(mod), remote_array, template_tensor, ctx, - [](Object* context) { - // auto container = static_cast(context); - // auto cb_func = - // reinterpret_cast*>(container->manager_ctx); - // (*cb_func)(); - // delete cb_func; - }, - nullptr); //(void*) func); + [](Object* context) {}, nullptr); }); } // namespace runtime From cf22894043f180584beb0cc35482a13f9bc43de2 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 15:20:03 -0800 Subject: [PATCH 19/60] git-clang-format --- src/runtime/rpc/rpc_module.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index f8f95c5b7e28..12510e0fac3a 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -476,8 +476,8 @@ TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, PackedFunc deleter) -> NDArray { return NDArrayFromRemoteOpaqueHandle( - RPCModuleGetSession(mod), remote_array, template_tensor, ctx, - [](Object* context) {}, nullptr); + RPCModuleGetSession(mod), remote_array, template_tensor, ctx, [](Object* context) {}, + nullptr); }); } // namespace runtime From ad5837e91249972b2fbe2998150c639b6e098f88 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: 
Fri, 13 Nov 2020 16:52:43 -0800 Subject: [PATCH 20/60] pylint --- python/tvm/contrib/binutils.py | 3 --- python/tvm/micro/debugger.py | 12 ++++++------ python/tvm/micro/session.py | 22 ++++++++++++++++++++++ python/tvm/relay/param_dict.py | 16 ---------------- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/python/tvm/contrib/binutils.py b/python/tvm/contrib/binutils.py index 53f92b9855fe..ea6b6320fbbd 100644 --- a/python/tvm/contrib/binutils.py +++ b/python/tvm/contrib/binutils.py @@ -17,10 +17,7 @@ """Utilities for binary file manipulation""" import logging -import os import subprocess -import tvm._ffi -from . import utils _LOG = logging.getLogger(__name__) diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index 9dd496a950e5..138b43d7cdc1 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -236,11 +236,11 @@ def write(self, data, timeout_sec): while timeout_sec == 0 or time.monotonic() < end_time: try: return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) - except OSError as e: - if e.errno == errno.EAGAIN: + except OSError as exc: + if exc.errno == errno.EAGAIN: time.sleep(0.1) continue - raise e + raise exc raise base.IoTimeoutError() @@ -249,11 +249,11 @@ def read(self, n, timeout_sec): while timeout_sec == 0 or time.monotonic() < end_time: try: return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) - except OSError as e: - if e.errno == errno.EAGAIN: + except OSError as exc: + if exc.errno == errno.EAGAIN: time.sleep(0.1) continue - raise e + raise exc raise base.IoTimeoutError() diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index adbad20cda06..5bc01186096f 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -155,6 +155,28 @@ def __exit__(self, exc_type, exc_value, exc_traceback): def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): + """Lookup a parameter that has been pre-linked into a remote (i.e. over RPC) Module. + + This function signature matches the signature built by + + Parameters + ---------- + mod : tvm.runtime.Module + The remote Module containing the pre-linked parameters. + storage_id : int + An integer identifying the pre-linked paramter to find + template_tensor : DLTensor + A DLTensor containing metadata that should be filled-in to the returned NDArray. This + function should mostly not inspect this, and just pass it along to + NDArrayFromRemoteOpaqueHandle. + ctx : TVMContext + The remote CPU context to be used with the returned NDArray. + + Returns + ------- + tvm.nd.NDArray : + NDArray containing the pre-linked parameter. + """ try: lookup_linked_param = mod.get_function("_lookup_linked_param") except KeyError: diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 37b4f1c72c4a..503a43ce3d4b 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -77,19 +77,3 @@ def load_param_dict(param_bytes): param_bytes = bytearray(param_bytes) load_arr = _load_param_dict(param_bytes) return {v.name: v.array for v in load_arr} - - -def linkable_param_dict(graph_json, params, target): - graph = json.loads(graph_json) - data_by_sid = [None] * len(params) - for param_name, param in params.items(): - for node in graph["nodes"]: - if node["name"] == param_name: - sid = node["storage_id"] - data_by_sid[sid] = param - - # GraphRuntimeCodegen is expected to allocated the first len(params) storage_ids to contain - # parameters. 
- assert all(d is not None for d in data_by_sid) - - data_ From 154bf5f871ee9428ba86f9d2b85cc48ec39baa9e Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 16:59:54 -0800 Subject: [PATCH 21/60] pylint again --- python/tvm/relay/param_dict.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 503a43ce3d4b..2d0398e20486 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name """Helper utility to save parameter dicts.""" -import json import tvm import tvm._ffi From 4d9fc2ed22338b78b889bbe606c5424f4cc1b6c8 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 17:16:50 -0800 Subject: [PATCH 22/60] rm debugging breaking build --- src/runtime/graph/graph_runtime.cc | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index e0401134cccc..293de2276621 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -282,20 +282,6 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { *rv = NDArray(GetObjectPtr(container.release())); } -std::string List2String(std::vector shape) { - if (shape.size() == 0) { - return "[]"; - } - - std::stringstream ss; - ss << "[" << shape[0]; - for (int i = 1; i < shape.size(); i++) { - ss << ", " << shape[i]; - } - ss << "]"; - return ss.str(); -} - void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; @@ -373,9 +359,6 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; ICHECK_LT(static_cast(storage_id), storage_pool_.size()); - LOG(INFO) << "sid " << i << ": (" << List2String(storage_pool_[storage_id].Shape()) - << ", dtype=" << storage_pool_[storage_id].DataType() << ")" - << ": setup view: " << List2String(attrs_.shape[i]); data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); const DLTensor* tmp = data_entry_[i].operator->(); From 891ccf52e7f83036f2ad8af22120435efdf0eab6 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 16 Nov 2020 19:37:04 -0800 Subject: [PATCH 23/60] fix incorrect parameter passing in GraphRuntimeModule --- src/runtime/crt/graph_runtime_module/graph_runtime_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c index 3e73efcc62ab..2a32a0251507 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -114,7 +114,7 @@ int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nar return kTvmErrorFunctionCallNumArguments; } - ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(); + ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime); ret_tcodes[0] = kTVMArgInt; return 0; } @@ -131,7 +131,7 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, } int output_index = args[0].v_int64; - if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs()) { + if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime)) { return kTvmErrorGraphModuleNoSuchInput; } From df132faf28ee9627220ea6f5bef3041a8997aaa4 Mon Sep 17 00:00:00 2001 From: 
Andrew Reusch Date: Mon, 16 Nov 2020 19:37:30 -0800 Subject: [PATCH 24/60] fixes for LLVM 4.0 and i386 --- src/target/llvm/codegen_llvm.cc | 68 +++++++++++++++------------------ 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index dbc41a6e1a54..1d63d35055dd 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -197,7 +197,7 @@ void CodeGenLLVM::LinkParameters(const Map params) { // tcodes param_types.push_back(t_int_->getPointerTo(GetGlobalAddressSpace())); // num_args - param_types.push_back(t_int64_); + param_types.push_back(t_int_); // ret_args param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace())); // ret_tcodes @@ -219,15 +219,19 @@ void CodeGenLLVM::LinkParameters(const Map params) { llvm::BasicBlock* entry = llvm::BasicBlock::Create(*ctx_, "entry", function); builder_->SetInsertPoint(entry); std::vector zero_index_list{llvm::ConstantInt::get(t_int32_, 0)}; + std::vector zero_array_index_list{llvm::ConstantInt::get(t_int32_, 0), + llvm::ConstantInt::get(t_int32_, 0)}; auto args_array = builder_->CreateBitCast( +#if TVM_LLVM_VERSION >= 50 &function->arg_begin()[0], +#else + &(*(function->arg_begin())), +#endif llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); llvm::Value* sid = builder_->CreateBitCast( builder_->CreateLoad(t_void_->getPointerTo(GetGlobalAddressSpace()), builder_->CreateInBoundsGEP(args_array, zero_index_list)), t_int64_); - // - // builder_->CreateGEP(&function->arg_begin()[0], zero_index_list), t_int64_); llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function); llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); @@ -235,9 +239,16 @@ void CodeGenLLVM::LinkParameters(const Map params) { builder_->SetInsertPoint(default_block); { auto ret_types_array = - builder_->CreateBitCast(&function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); + builder_->CreateBitCast( +#if TVM_LLVM_VERSION >= 50 + &function->arg_begin()[4], +#else + &(*(std::next(function->arg_begin(), 4))), +#endif + llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr), - builder_->CreateGEP(ret_types_array, zero_index_list)); + builder_->CreateGEP(ret_types_array, zero_array_index_list)); builder_->CreateRet(ConstInt32(kTvmErrorNoError)); } @@ -245,8 +256,6 @@ void CodeGenLLVM::LinkParameters(const Map params) { for (auto kv : params) { auto array = NDArrayToLLVMArray(ctx_, kv.second->param); - std::cout << "param " << kv.first << ": "; - array->print(os); std::string symbol_name = std::string(::tvm::runtime::symbol::tvm_param_prefix) + kv.first; llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable( *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, array, symbol_name); @@ -256,42 +265,27 @@ void CodeGenLLVM::LinkParameters(const Map params) { llvm::cast(llvm::ConstantInt::get(t_int64_, kv.second->id)), case_block); builder_->SetInsertPoint(case_block); auto retval_array = builder_->CreateBitCast( +#if TVM_LLVM_VERSION >= 50 &function->arg_begin()[3], - llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)); +#else + &(*std::next(function->arg_begin(), 3)), +#endif + llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo()); builder_->CreateStore( builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), 
- builder_->CreateGEP(retval_array, zero_index_list)); + builder_->CreateGEP(retval_array, zero_array_index_list)); auto ret_types_array = - builder_->CreateBitCast(&function->arg_begin()[4], llvm::ArrayType::get(t_int_, 1)); - builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), - builder_->CreateGEP(ret_types_array, zero_index_list)); + builder_->CreateBitCast( +#if TVM_LLVM_VERSION >= 50 + &function->arg_begin()[4], +#else + &(*std::next(function->arg_begin(), 4)), +#endif + llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), + builder_->CreateGEP(ret_types_array, zero_array_index_list)); builder_->CreateRet(ConstInt32(0)); } - - std::cout << "generated function: " << std::endl; - function->print(os); - - // llvm::Value* sid_start = module_->getGlobalVariable(module::tvm_param_array_sid_start); - // llvm::Value* cond = builder_->CreateAnd( - // builder_->CreateICmpSGE(sid, sid_start), - // builder_->CreateICmpSLT(sid, - // module_->getGlobalVariable(module::tvm_param_array_sid_end))); - - // BasicBlock* then_block = BasicBlock::Create(*ctx_, "if_then", function_); - // builder_->CreateCondBr(cond, then_block, else_block); - - // // SID valid block (fetch sid data pointer and write to ret_values). - // builder_->SetInsertPoint(then_block); - // std::vector sid_index_list{builder_->CreateISub(sid, sid_start)}; - // builder_->CreateStore( - // builder_->CreateGEP(module_->getGlobalVariable(module::tvm_param_array), sid_index_list), - // builder_->CreateBitCast( - // builder_->CreateGEP(function->getArg(3), zero_index_list), t_int64_ty_)); - // NOTE: set ret_tcode[0] to kTVMOpaqueHandle because the 'data' pointer of a DLTensor is returned - // here, *not* a proper DLTensor. It is up to the caller to create a DLTensor that correctly - // describes the returned data pointer. - - // SID invalid block (return invalid SID error). 
 }
 
 std::unique_ptr<llvm::Module> CodeGenLLVM::Finish() {

From 8bb51e426bd87ed60267f6208ca4e3063e929d76 Mon Sep 17 00:00:00 2001
From: Andrew Reusch 
Date: Mon, 16 Nov 2020 19:37:45 -0800
Subject: [PATCH 25/60] set default for --link-params

---
 src/target/target_kind.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index f249ef8f529d..3294a7870d66 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -213,12 +213,12 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<String>("mfloat-abi")
     .add_attr_option<Bool>("system-lib")
     .add_attr_option<String>("runtime")
-    .add_attr_option<Bool>("link-params")
+    .add_attr_option<Bool>("link-params", Bool(false))
     .set_default_keys({"cpu"});
 
 TVM_REGISTER_TARGET_KIND("c", kDLCPU)
     .add_attr_option<Bool>("system-lib")
-    .add_attr_option<Bool>("link-params")
+    .add_attr_option<Bool>("link-params", Bool(false))
     .add_attr_option<String>("runtime")
     .add_attr_option<String>("mcpu")
     .set_default_keys({"cpu"});

From 03432d2c2b566fcce2d6e35ed9d2348db9c4bc8a Mon Sep 17 00:00:00 2001
From: Andrew Reusch 
Date: Mon, 16 Nov 2020 19:39:09 -0800
Subject: [PATCH 26/60] switch link order for proper library symbol resolution

---
 tests/python/unittest/test_link_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index 3dc3122af81c..4b6692d79d10 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -358,7 +358,7 @@ def test_crt_link_params():
         bin_opts=opts["bin_opts"],
         extra_libs=[
             os.path.join(tvm.micro.CRT_ROOT_DIR, m)
-            for m in ("graph_runtime", "graph_runtime_module")
+            for m in ("graph_runtime_module", "graph_runtime")
         ],
     )

From b13472a9732c5947675367243818a017aa14f271 Mon Sep 17 00:00:00 2001
From: Andrew Reusch 
Date: Mon, 16 Nov 2020 20:23:34 -0800
Subject: [PATCH 27/60] git-clang-format

---
 src/target/llvm/codegen_llvm.cc | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 1d63d35055dd..11da661cceac 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -238,14 +238,13 @@ void CodeGenLLVM::LinkParameters(const Map<String, LinkedParam> params) {
 
   builder_->SetInsertPoint(default_block);
   {
-    auto ret_types_array =
-        builder_->CreateBitCast(
+    auto ret_types_array = builder_->CreateBitCast(
 #if TVM_LLVM_VERSION >= 50
-            &function->arg_begin()[4],
+        &function->arg_begin()[4],
 #else
-            &(*(std::next(function->arg_begin(), 4))),
+        &(*(std::next(function->arg_begin(), 4))),
 #endif
-            llvm::ArrayType::get(t_int_, 1)->getPointerTo());
+        llvm::ArrayType::get(t_int_, 1)->getPointerTo());
 
     builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr),
                           builder_->CreateGEP(ret_types_array, zero_array_index_list));
@@ -274,16 +273,15 @@ void CodeGenLLVM::LinkParameters(const Map<String, LinkedParam> params) {
     builder_->CreateStore(
         builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())),
         builder_->CreateGEP(retval_array, zero_array_index_list));
-    auto ret_types_array =
-        builder_->CreateBitCast(
+    auto ret_types_array = builder_->CreateBitCast(
 #if TVM_LLVM_VERSION >= 50
-            &function->arg_begin()[4],
+        &function->arg_begin()[4],
 #else
             &(*std::next(function->arg_begin(), 4)),
 #endif
-            llvm::ArrayType::get(t_int_, 1)->getPointerTo());
-    builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle),
-                          builder_->CreateGEP(ret_types_array, zero_array_index_list));
+
llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), + builder_->CreateGEP(ret_types_array, zero_array_index_list)); builder_->CreateRet(ConstInt32(0)); } } From 02d9744f63793e82ccb9338d37d6676852818bb9 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 16 Nov 2020 20:35:42 -0800 Subject: [PATCH 28/60] black format + pylint --- python/tvm/micro/transport/file_descriptor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py index 6df6cd425eff..22377f0eea2e 100644 --- a/python/tvm/micro/transport/file_descriptor.py +++ b/python/tvm/micro/transport/file_descriptor.py @@ -20,7 +20,6 @@ import fcntl import os import select -import time from . import base From d9c7b9cc6bfc8dd6d79c188df16465b8ca8d04b9 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 16 Nov 2020 20:38:20 -0800 Subject: [PATCH 29/60] pylint again --- python/tvm/micro/transport/file_descriptor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py index 22377f0eea2e..6df6cd425eff 100644 --- a/python/tvm/micro/transport/file_descriptor.py +++ b/python/tvm/micro/transport/file_descriptor.py @@ -20,6 +20,7 @@ import fcntl import os import select +import time from . import base From 3ddadf337b171eb31c9e1b1519ee7664bd558122 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 16 Nov 2020 20:58:12 -0800 Subject: [PATCH 30/60] fix target_test to recognize --link-params --- tests/cpp/target_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index 3d528f821059..a422f12b04d7 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -147,8 +147,9 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->keys.size(), 2U); ICHECK_EQ(target->keys[0], "cpu"); ICHECK_EQ(target->keys[1], "arm_cpu"); - ICHECK_EQ(target->attrs.size(), 1U); + ICHECK_EQ(target->attrs.size(), 2U); ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); + ICHECK_EQ(target->GetAttr("link-params"), false); } int main(int argc, char** argv) { From ef740219d0ce61aac67fefbc74de0a06002689a1 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 09:15:48 -0800 Subject: [PATCH 31/60] remove --link-params from default micro() target. * can be set with options=. 
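
Parameter linking can still be enabled per-target by passing the flag through options=. A
minimal sketch, assuming micro() stays importable from python/tvm/target/target.py and keeps
its options= keyword; the board name is just an illustrative entry from the trans_table touched
below:

    from tvm.target.target import micro

    # Opt back in to parameter linking for a single micro target.
    target = micro("stm32f746xx", options=["--link-params"])
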
--- python/tvm/target/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index cd874b8bffe4..a8934539020a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -236,7 +236,7 @@ def micro(model="unknown", options=None): "stm32f746xx": ["-mcpu=cortex-m7"], } opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", "--link-params", f"-model={model}"], + trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options, ) From 51776082a48d1fb38d2a3a6eb92d36f924004e98 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 09:16:21 -0800 Subject: [PATCH 32/60] import testing, somehow not needed before --- tests/python/unittest/test_crt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 3b5471d0bb8b..3d6923342652 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -28,6 +28,7 @@ import tvm import tvm.relay +import tvm.testing from tvm.topi.utils import get_const_tuple from tvm.topi.testing import conv2d_nchw_python From 1f471b1d60de219908c56fc1a028bd33f7acfb08 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 10:20:41 -0800 Subject: [PATCH 33/60] catch correct error from remote module lookup --- python/tvm/micro/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 5bc01186096f..8aa4ad44f607 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -179,7 +179,7 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): """ try: lookup_linked_param = mod.get_function("_lookup_linked_param") - except KeyError: + except AttributeError: return None remote_data = lookup_linked_param(storage_id) From 3e1eb4a06d47e43fb9f236bcf040041c302a91ec Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 10:27:53 -0800 Subject: [PATCH 34/60] CRT RPC-level ModuleGetFunction behaves like C++ on error * returns no error/kNullptr on function name not found --- src/runtime/crt/common/crt_runtime_api.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 9a0663fc704d..f2d67ccfbeab 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -229,17 +229,17 @@ int TVMFuncCall(TVMFunctionHandle func_handle, TVMValue* arg_values, int* type_c return func(arg_values, type_codes, num_args, ret_val, ret_type_code, resource_handle); } -static int FindFunctionOrSetAPIError(tvm_module_index_t module_index, - const TVMFuncRegistry* registry, const char* name, - TVMFunctionHandle* out) { +static tvm_crt_error_t FindFunctionOrSetAPIError(tvm_module_index_t module_index, + const TVMFuncRegistry* registry, const char* name, + TVMFunctionHandle* out) { tvm_function_index_t function_index; - if (TVMFuncRegistry_Lookup(registry, name, &function_index) != 0) { - TVMAPIErrorf("failed to get function: mod_index=%04" PRIx16 ", name=%s", module_index, name); - return -1; + tvm_crt_error_t err = TVMFuncRegistry_Lookup(registry, name, &function_index); + if (err != kTvmErrorNoError) { + return err; } *out = EncodeFunctionHandle(module_index, function_index); - return 0; + return kTvmErrorNoError; } int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out) { 
@@ -279,6 +279,14 @@ int ModuleGetFunction(TVMValue* args, int* type_codes, int num_args, TVMValue* r if (to_return == 0) { ret_type_codes[0] = kTVMPackedFuncHandle; + } else { + ret_value->v_handle = NULL; + } + + // NOTE: For compatibility with C++ runtime API, return no error (but NULL function) when the + // function lookup failed. + if (to_return == kTvmErrorFunctionNameNotFound) { + to_return = kTvmErrorNoError; } return to_return; From 289b1b795c229057cea3f65dae90cfd7df971d01 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 10:29:22 -0800 Subject: [PATCH 35/60] black format --- python/tvm/target/target.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index a8934539020a..9a609e5a334b 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -236,8 +236,7 @@ def micro(model="unknown", options=None): "stm32f746xx": ["-mcpu=cortex-m7"], } opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], - options, + trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options, ) # NOTE: in the future, the default micro target will be LLVM except when From ea0290baf41b79e7c01902cb26adc50a95100cec Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 11:38:10 -0800 Subject: [PATCH 36/60] black format again --- python/tvm/target/target.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 9a609e5a334b..a8934539020a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -236,7 +236,8 @@ def micro(model="unknown", options=None): "stm32f746xx": ["-mcpu=cortex-m7"], } opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options, + trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], + options, ) # NOTE: in the future, the default micro target will be LLVM except when From 652571ddd3824d99e0a8b9beba499439208cd4b6 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 13:08:18 -0800 Subject: [PATCH 37/60] rm test_binutils, code is gone --- tests/python/contrib/test_binutils.py | 167 -------------------------- 1 file changed, 167 deletions(-) delete mode 100644 tests/python/contrib/test_binutils.py diff --git a/tests/python/contrib/test_binutils.py b/tests/python/contrib/test_binutils.py deleted file mode 100644 index f0aa2d157aed..000000000000 --- a/tests/python/contrib/test_binutils.py +++ /dev/null @@ -1,167 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Test various utilities for interaction with compiled binaries. 
- -Specifically, we test the following capabilities: - - querying the size of a binary section - - relocating sections within a binary to new addresses - - reading the contents of a binary section - - querying the address of a symbol in the binary -""" - -import tvm -from tvm import te -import subprocess -from tvm.contrib import utils -from tvm.contrib import cc -from tvm.contrib.binutils import * - -TOOLCHAIN_PREFIX = "" - - -def make_binary(): - prog = "int a = 7; \ - int main() { \ - int b = 5; \ - return 0; \ - }" - tmp_dir = utils.tempdir() - tmp_source = tmp_dir.relpath("source.c") - tmp_obj = tmp_dir.relpath("obj.obj") - with open(tmp_source, "w") as f: - f.write(prog) - cc.create_executable(tmp_obj, tmp_source, [], cc="{}gcc".format(TOOLCHAIN_PREFIX)) - prog_bin = bytearray(open(tmp_obj, "rb").read()) - return prog_bin - - -def test_tvm_callback_get_section_size(binary=None): - if binary is None: - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - print( - "Text section size: %d" - % tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX) - ) - print( - "Data section size: %d" - % tvm_callback_get_section_size(tmp_bin, "data", TOOLCHAIN_PREFIX) - ) - print( - "Bss section size: %d" % tvm_callback_get_section_size(tmp_bin, "bss", TOOLCHAIN_PREFIX) - ) - print() - - verify() - - -def test_tvm_callback_relocate_binary(): - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - word_size = 8 - text_loc = 0x0 - rodata_loc = 0x10000 - data_loc = 0x20000 - bss_loc = 0x30000 - stack_end = 0x50000 - rel_bin = tvm_callback_relocate_binary( - tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX - ) - print("Relocated binary section sizes") - test_tvm_callback_get_section_size(binary=rel_bin) - relf = tmp_dir.relpath("rel.bin") - with open(relf, "wb") as f: - f.write(rel_bin) - nm_proc = subprocess.Popen( - ["nm", "-C", "--defined-only", relf], stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, _) = nm_proc.communicate() - symbol_entries = out.decode("utf-8").split("\n") - for entry in symbol_entries: - if len(entry) == 0: - continue - sym_loc, section, sym_name = entry.split(" ") - sym_loc = int(sym_loc, 16) - if section == "T": # text - assert sym_loc >= text_loc and sym_loc < data_loc - elif section == "D": # data - assert sym_loc >= data_loc and sym_loc < bss_loc - elif section == "B": # bss - assert sym_loc >= bss_loc - - verify() - - -def test_tvm_callback_read_binary_section(): - binary = make_binary() - - def verify(): - text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) - data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) - bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) - print("Read text section part of binary? %r" % (text_bin in binary)) - print("Read data section part of binary? %r" % (data_bin in binary)) - print("Read bss section part of binary? 
%r" % (bss_bin in binary)) - print() - - verify() - - -def test_tvm_callback_get_symbol_map(): - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - word_size = 8 - text_loc = 0x0 - rodata_loc = 0x10000 - data_loc = 0x20000 - bss_loc = 0x30000 - stack_end = 0x50000 - rel_bin = tvm_callback_relocate_binary( - tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX - ) - symbol_map = tvm_callback_get_symbol_map(rel_bin, TOOLCHAIN_PREFIX) - symbols = set() - for i, line in enumerate(symbol_map.split("\n")): - # Every other line is the value the symbol maps to. - if i % 2 == 0: - symbols.add(line) - assert "a" in symbols - assert "main" in symbols - - verify() - - -if __name__ == "__main__": - test_tvm_callback_get_section_size() - test_tvm_callback_relocate_binary() - test_tvm_callback_read_binary_section() - test_tvm_callback_get_symbol_map() From 4976e50d3a562e667483873bc550795f6cf5a911 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 13:18:42 -0800 Subject: [PATCH 38/60] split codegen_params into two files, C and LLVM --- src/target/llvm/codegen_params.cc | 263 +------------------------ src/target/llvm/codegen_params.h | 2 - src/target/source/codegen_c_host.cc | 2 +- src/target/source/codegen_params.cc | 295 ++++++++++++++++++++++++++++ src/target/source/codegen_params.h | 38 ++++ 5 files changed, 337 insertions(+), 263 deletions(-) create mode 100644 src/target/source/codegen_params.cc create mode 100644 src/target/source/codegen_params.h diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index a7d79c466391..243de66381e4 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -18,24 +18,23 @@ */ /*! - * \file codegen_blob.cc + * \file codegen_params.cc */ #ifdef TVM_LLVM_VERSION #include "codegen_params.h" -#include -#include -#include #include namespace tvm { namespace codegen { +namespace { class DLManagedTensorDeleter { public: void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } }; +} llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { llvm::Type* element_type = nullptr; @@ -145,262 +144,6 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: llvm::ArrayType::get(element_type, num_elements), llvm::ArrayRef(elements))); } -static constexpr const char* kFloatCast = "(float)"; -static constexpr const char* kDoubleCast = "(double)"; - -static constexpr const int kMaxLineLength = 80; - -void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { - auto arr_type = arr.DataType(); - CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " - << arr_type.lanes(); - - int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); - if (arr_type.code() == runtime::DataType::TypeCode::kInt) { - one_element_size_bytes += 1; // sign bit - if (arr_type.bits() > 32) { - one_element_size_bytes += 2; // "UL" - } - } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { - if (arr_type.bits() > 32) { - one_element_size_bytes += 1; // "L" - } - } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { - // Floats and doubles are printed as hex but casted. 
- one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */ + - 1 /* extra decimal digit in exponent */; - } - - int elements_per_row = 16; - while (elements_per_row > 1 && - (elements_per_row * one_element_size_bytes) > (kMaxLineLength - indent_chars)) { - elements_per_row /= 2; - } - - std::string indent_str(indent_chars, ' '); - os << indent_str; - - auto shape = arr.Shape(); - int num_elements = 1; - for (auto shape_elem : shape) { - num_elements *= shape_elem; - } - - std::unique_ptr tensor(arr.ToDLPack()); - auto old_fmtflags = os.flags(); - os.setf(std::ios::internal | std::ios::hex, - std::ios::adjustfield | std::ios::basefield | std::ios::showbase); - os.fill('0'); - switch (arr_type.code()) { - case runtime::DataType::kInt: - CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid - // printing as a char. - int8_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(2) << +static_cast(to_print); - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - int16_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(4) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - int32_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint32_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(8) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - int64_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint64_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(16) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kUInt: - CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || - arr_type.bits() == 64) - << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " - << arr_type.bits() << "-bit array"; - - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid - // printing as a char. 
- os << "0x" << std::setw(2) - << +static_cast(static_cast(tensor->dl_tensor.data)[i]); - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else { - CHECK(false) << "should not get here"; - } - break; - - case runtime::DataType::TypeCode::kFloat: { - std::stringstream ss; - ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, - std::ios::basefield | std::ios::showbase | std::ios::floatfield); - os.fill(' '); - os.setf(std::ios::left, std::ios::adjustfield); - if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - float elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. - os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); - } - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - std::cout << "\n"; - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - double elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. 
- os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); - } - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - } else { - CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " - << arr_type.bits() << "-bit array"; - } - break; - } - - default: - CHECK(false) << "Data type not supported"; - } - - if (num_elements % elements_per_row != 0) { - os << "\n"; - } - os.flags(old_fmtflags); -} - } // namespace codegen } // namespace tvm diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h index 6e3c3e8eafd6..c21820aa6c3f 100644 --- a/src/target/llvm/codegen_params.h +++ b/src/target/llvm/codegen_params.h @@ -34,8 +34,6 @@ namespace codegen { llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr); -void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); - void LLVMCodeGenParams(llvm::LLVMContext* ctx, llvm::Module* module, int64_t storage_id_offset, ::tvm::runtime::Array param_names, ::tvm::runtime::Array params_by_sid); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index f47e07e94bd1..0a19fc1399b7 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -33,7 +33,7 @@ #include "../../support/str_escape.h" #include "../build_common.h" #include "../func_registry_generator.h" -#include "../llvm/codegen_params.h" +#include "codegen_params.h" namespace tvm { namespace codegen { diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc new file mode 100644 index 000000000000..ae02a957467e --- /dev/null +++ b/src/target/source/codegen_params.cc @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file codegen_params.cc + */ + +#include "codegen_params.h" + +#include +#include +#include +#include + +namespace tvm { +namespace codegen { + +namespace { +class DLManagedTensorDeleter { + public: + void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } +}; +} + +static constexpr const int kMaxLineLength = 80; + +void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { + auto arr_type = arr.DataType(); + CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " + << arr_type.lanes(); + + int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); + if (arr_type.code() == runtime::DataType::TypeCode::kInt) { + one_element_size_bytes += 1; // sign bit + if (arr_type.bits() > 32) { + one_element_size_bytes += 2; // "UL" + } + } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { + if (arr_type.bits() > 32) { + one_element_size_bytes += 1; // "L" + } + } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { + // Floats and doubles are printed as hex but casted. + one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */ + + 1 /* extra decimal digit in exponent */; + } + + int elements_per_row = 16; + while (elements_per_row > 1 && + (elements_per_row * one_element_size_bytes) > (kMaxLineLength - indent_chars)) { + elements_per_row /= 2; + } + + std::string indent_str(indent_chars, ' '); + os << indent_str; + + auto shape = arr.Shape(); + int num_elements = 1; + for (auto shape_elem : shape) { + num_elements *= shape_elem; + } + + std::unique_ptr tensor(arr.ToDLPack()); + auto old_fmtflags = os.flags(); + os.setf(std::ios::internal | std::ios::hex, + std::ios::adjustfield | std::ios::basefield | std::ios::showbase); + os.fill('0'); + switch (arr_type.code()) { + case runtime::DataType::kInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid + // printing as a char. 
+ int8_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint16_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(2) << +static_cast(to_print); + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + int16_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint16_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(4) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + int32_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint32_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(8) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + int64_t elem = static_cast(tensor->dl_tensor.data)[i]; + uint64_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(16) << to_print; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kUInt: + CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 || + arr_type.bits() == 64) + << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " + << arr_type.bits() << "-bit array"; + + if (arr_type.bits() == 8) { + for (int i = 0; i < num_elements; i++) { + // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid + // printing as a char. 
+ os << "0x" << std::setw(2) + << +static_cast(static_cast(tensor->dl_tensor.data)[i]); + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 16) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else { + CHECK(false) << "should not get here"; + } + break; + + case runtime::DataType::TypeCode::kFloat: { + std::stringstream ss; + ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + os.fill(' '); + os.setf(std::ios::left, std::ios::adjustfield); + if (arr_type.bits() == 32) { + for (int i = 0; i < num_elements; i++) { + float elem = static_cast(tensor->dl_tensor.data)[i]; + if (std::isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (std::isnan(elem)) { + // GNU extension, implemenatation-dependent. + os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + std::cout << "\n"; + } else if (arr_type.bits() == 64) { + for (int i = 0; i < num_elements; i++) { + double elem = static_cast(tensor->dl_tensor.data)[i]; + if (std::isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (std::isnan(elem)) { + // GNU extension, implemenatation-dependent. + os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } + } else { + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; + } + break; + } + + default: + CHECK(false) << "Data type not supported"; + } + + if (num_elements % elements_per_row != 0) { + os << "\n"; + } + os.flags(old_fmtflags); +} + +} // codegen codegen +} // codegen tvm diff --git a/src/target/source/codegen_params.h b/src/target/source/codegen_params.h new file mode 100644 index 000000000000..6ef3f4fbc63e --- /dev/null +++ b/src/target/source/codegen_params.h @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file codegen_params.h + */ + +#ifndef TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ +#define TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ + +#include +#include + +namespace tvm { +namespace codegen { + +void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ From 219e3769864d1698628f342abe53a6ba2e1f873c Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 13:19:10 -0800 Subject: [PATCH 39/60] git-clang-format --- src/target/llvm/codegen_params.cc | 2 +- src/target/source/codegen_params.cc | 9 +++++---- src/target/source/codegen_params.h | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 243de66381e4..8b405d6e92f8 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -34,7 +34,7 @@ class DLManagedTensorDeleter { public: void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } }; -} +} // namespace llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { llvm::Type* element_type = nullptr; diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index ae02a957467e..f55071f16c19 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -23,10 +23,11 @@ #include "codegen_params.h" +#include + #include #include #include -#include namespace tvm { namespace codegen { @@ -36,7 +37,7 @@ class DLManagedTensorDeleter { public: void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } }; -} +} // namespace static constexpr const int kMaxLineLength = 80; @@ -291,5 +292,5 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os.flags(old_fmtflags); } -} // codegen codegen -} // codegen tvm +} // namespace codegen +} // namespace tvm diff --git a/src/target/source/codegen_params.h b/src/target/source/codegen_params.h index 6ef3f4fbc63e..a3d277eac590 100644 --- a/src/target/source/codegen_params.h +++ b/src/target/source/codegen_params.h @@ -24,9 +24,10 @@ #ifndef TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ #define TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ -#include #include +#include + namespace tvm { namespace codegen { From 762310b416d7bb097175239e13ff5917d0d338d0 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 13:24:50 -0800 Subject: [PATCH 40/60] cpplint --- src/target/llvm/codegen_params.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 8b405d6e92f8..8f92e4f19b3a 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -24,6 +24,7 @@ #include "codegen_params.h" +#include #include namespace tvm { From 7af6c9a76020a75fdff8396c9ef34b8d0f7d431d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 
2020 13:44:28 -0800 Subject: [PATCH 41/60] add missing include for linux compilation --- src/target/source/codegen_params.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index f55071f16c19..99f7e44ca8e7 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -23,6 +23,7 @@ #include "codegen_params.h" +#include #include #include From b76d2cea6419e87e584e1f865ae02ccdeaf5ce41 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 17 Nov 2020 14:43:07 -0800 Subject: [PATCH 42/60] cpplint --- src/target/source/codegen_params.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index 99f7e44ca8e7..74524b3545d3 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -23,9 +23,9 @@ #include "codegen_params.h" -#include #include +#include #include #include #include From 78de39ee78d8ff72cb0e57d550cc89bb530faf27 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 10:48:48 -0800 Subject: [PATCH 43/60] Keep binutils as it was, not part of this PR --- python/tvm/contrib/binutils.py | 280 +++++++++++++++++++++++++- tests/python/contrib/test_binutils.py | 167 +++++++++++++++ 2 files changed, 444 insertions(+), 3 deletions(-) create mode 100644 tests/python/contrib/test_binutils.py diff --git a/python/tvm/contrib/binutils.py b/python/tvm/contrib/binutils.py index ea6b6320fbbd..646362a5587f 100644 --- a/python/tvm/contrib/binutils.py +++ b/python/tvm/contrib/binutils.py @@ -16,10 +16,61 @@ # under the License. """Utilities for binary file manipulation""" -import logging +import os import subprocess +import tvm._ffi +from . import utils -_LOG = logging.getLogger(__name__) +# TODO does this file still belong in `contrib`. is it too µTVM-specific? + +# TODO shouldn't need so many `ALIGN` directives +RELOCATION_LD_SCRIPT_TEMPLATE = """ +/* linker symbol for use in UTVMInit */ +_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; + +SECTIONS +{{ + . = 0x{text_start:x}; + . = ALIGN({word_size}); + .text : + {{ + . = ALIGN({word_size}); + KEEP(*(.text)) + KEEP(*(.text*)) + . = ALIGN({word_size}); + }} + + . = 0x{rodata_start:x}; + . = ALIGN({word_size}); + .rodata : + {{ + . = ALIGN({word_size}); + KEEP(*(.rodata)) + KEEP(*(.rodata*)) + . = ALIGN({word_size}); + }} + + . = 0x{data_start:x}; + . = ALIGN({word_size}); + .data : + {{ + . = ALIGN({word_size}); + KEEP(*(.data)) + KEEP(*(.data*)) + . = ALIGN({word_size}); + }} + + . = 0x{bss_start:x}; + . = ALIGN({word_size}); + .bss : + {{ + . = ALIGN({word_size}); + KEEP(*(.bss)) + KEEP(*(.bss*)) + . = ALIGN({word_size}); + }} +}} +""" def run_cmd(cmd): @@ -35,7 +86,6 @@ def run_cmd(cmd): output : str resulting stdout capture from the subprocess """ - _LOG.debug("execute: %s", " ".join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (output, _) = proc.communicate() output = output.decode("utf-8") @@ -44,3 +94,227 @@ def run_cmd(cmd): msg = f'error while running command "{cmd_str}":\n{output}' raise RuntimeError(msg) return output + + +@tvm._ffi.register_func("tvm_callback_get_section_size") +def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): + """Finds size of the section in the binary. 
+ Assumes `size` shell command exists (typically works only on Linux machines) + + Parameters + ---------- + binary_path : str + path of the binary file + + section_name : str + name of section + + toolchain_prefix : str + prefix for binary names in target compiler toolchain + + Returns + ------- + size : integer + size of the section in bytes + """ + if not os.path.isfile(binary_path): + raise RuntimeError('no such file "{}"'.format(binary_path)) + # We use the "-A" flag here to get the ".rodata" section's size, which is + # not included by default. + size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) + + # TODO(weberlo): Refactor this method and `*relocate_binary` so they are + # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". + section_mapping = { + ".text": [".text"], + ".rodata": [".rodata"], + ".data": [".data", ".sdata"], + ".bss": [".bss", ".sbss"], + } + sections_to_sum = section_mapping["." + section_name] + section_size = 0 + # Skip the first two header lines in the `size` output. + for line in size_output.split("\n")[2:]: + tokens = list(filter(lambda s: len(s) != 0, line.split(" "))) + if len(tokens) != 3: + continue + entry_name = tokens[0] + entry_size = int(tokens[1]) + for section in sections_to_sum: + if entry_name.startswith(section): + section_size += entry_size + break + + # NOTE: in the past, section_size has been wrong on x86. it may be + # inconsistent. TODO: maybe stop relying on `*size` to give us the size and + # instead read the section with `*objcopy` and count the bytes. + # NOTE(areusch): I think the problem is due to alignment ops in the linker. + # Since this is going away in the impending switch to on-device runtime, + # add a constant to hopefully absorb these relocations. + if section_size > 0: + section_size += 64 + + return section_size + + +@tvm._ffi.register_func("tvm_callback_relocate_binary") +def tvm_callback_relocate_binary( + binary_path, + word_size, + text_start, + rodata_start, + data_start, + bss_start, + stack_end, + toolchain_prefix, +): + """Relocates sections in the binary to new addresses + + Parameters + ---------- + binary_path : str + path of the binary file + + word_size : int + word size on the target machine + + text_start : int + text section address + + rodata_start : int + rodata section address + + data_start : int + data section address + + bss_start : int + bss section address + + stack_end : int + stack section end address + + toolchain_prefix : str + prefix for binary names in target compiler toolchain + + Returns + ------- + rel_bin : bytearray + the relocated binary + """ + assert text_start < rodata_start < data_start < bss_start < stack_end + stack_pointer_init = stack_end - word_size + ld_script_contents = "" + # TODO(weberlo): There should be a better way to configure this for different archs. + # TODO is this line even necessary? 
+ if "riscv" in toolchain_prefix: + ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' + ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( + word_size=word_size, + text_start=text_start, + rodata_start=rodata_start, + data_start=data_start, + bss_start=bss_start, + stack_pointer_init=stack_pointer_init, + ) + + tmp_dir = utils.tempdir() + rel_obj_path = tmp_dir.relpath("relocated.obj") + rel_ld_script_path = tmp_dir.relpath("relocate.lds") + with open(rel_ld_script_path, "w") as f: + f.write(ld_script_contents) + run_cmd( + ["{}ld".format(toolchain_prefix), binary_path, "-T", rel_ld_script_path, "-o", rel_obj_path] + ) + + with open(rel_obj_path, "rb") as f: + rel_bin = bytearray(f.read()) + + gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR") + if gdb_init_dir is not None: + gdb_init_path = f"{gdb_init_dir}/.gdbinit" + with open(gdb_init_path, "r") as f: + gdbinit_contents = f.read().split("\n") + new_contents = [] + for line in gdbinit_contents: + new_contents.append(line) + if line.startswith("target"): + new_contents.append(f"add-symbol-file {rel_obj_path}") + with open(gdb_init_path, "w") as f: + f.write("\n".join(new_contents)) + + return rel_bin + + +@tvm._ffi.register_func("tvm_callback_read_binary_section") +def tvm_callback_read_binary_section(binary, section, toolchain_prefix): + """Returns the contents of the specified section in the binary byte array + + Parameters + ---------- + binary : bytearray + contents of the binary + + section : str + type of section + + toolchain_prefix : str + prefix for binary names in target compiler toolchain + + Returns + ------- + section_bin : bytearray + contents of the read section + """ + tmp_dir = utils.tempdir() + tmp_bin = tmp_dir.relpath("temp.bin") + tmp_section = tmp_dir.relpath("tmp_section.bin") + with open(tmp_bin, "wb") as out_file: + out_file.write(bytes(binary)) + run_cmd( + [ + "{}objcopy".format(toolchain_prefix), + "--dump-section", + ".{}={}".format(section, tmp_section), + tmp_bin, + ] + ) + if os.path.isfile(tmp_section): + # Get section content if it exists. + with open(tmp_section, "rb") as f: + section_bin = bytearray(f.read()) + else: + # Return empty bytearray if the section does not exist. + section_bin = bytearray("", "utf-8") + return section_bin + + +@tvm._ffi.register_func("tvm_callback_get_symbol_map") +def tvm_callback_get_symbol_map(binary, toolchain_prefix): + """Obtains a map of symbols to addresses in the passed binary + + Parameters + ---------- + binary : bytearray + contents of the binary + + toolchain_prefix : str + prefix for binary names in target compiler toolchain + + Returns + ------- + map_str : str + map of defined symbols to addresses, encoded as a series of + alternating newline-separated keys and values + """ + tmp_dir = utils.tempdir() + tmp_obj = tmp_dir.relpath("tmp_obj.bin") + with open(tmp_obj, "wb") as out_file: + out_file.write(bytes(binary)) + nm_output = run_cmd(["{}nm".format(toolchain_prefix), "-C", "--defined-only", tmp_obj]) + nm_output = nm_output.splitlines() + map_str = "" + for line in nm_output: + line = line.split() + map_str += line[2] + "\n" + map_str += line[0] + "\n" + return map_str diff --git a/tests/python/contrib/test_binutils.py b/tests/python/contrib/test_binutils.py new file mode 100644 index 000000000000..f0aa2d157aed --- /dev/null +++ b/tests/python/contrib/test_binutils.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test various utilities for interaction with compiled binaries. + +Specifically, we test the following capabilities: + - querying the size of a binary section + - relocating sections within a binary to new addresses + - reading the contents of a binary section + - querying the address of a symbol in the binary +""" + +import tvm +from tvm import te +import subprocess +from tvm.contrib import utils +from tvm.contrib import cc +from tvm.contrib.binutils import * + +TOOLCHAIN_PREFIX = "" + + +def make_binary(): + prog = "int a = 7; \ + int main() { \ + int b = 5; \ + return 0; \ + }" + tmp_dir = utils.tempdir() + tmp_source = tmp_dir.relpath("source.c") + tmp_obj = tmp_dir.relpath("obj.obj") + with open(tmp_source, "w") as f: + f.write(prog) + cc.create_executable(tmp_obj, tmp_source, [], cc="{}gcc".format(TOOLCHAIN_PREFIX)) + prog_bin = bytearray(open(tmp_obj, "rb").read()) + return prog_bin + + +def test_tvm_callback_get_section_size(binary=None): + if binary is None: + binary = make_binary() + tmp_dir = utils.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + + def verify(): + print( + "Text section size: %d" + % tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX) + ) + print( + "Data section size: %d" + % tvm_callback_get_section_size(tmp_bin, "data", TOOLCHAIN_PREFIX) + ) + print( + "Bss section size: %d" % tvm_callback_get_section_size(tmp_bin, "bss", TOOLCHAIN_PREFIX) + ) + print() + + verify() + + +def test_tvm_callback_relocate_binary(): + binary = make_binary() + tmp_dir = utils.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + + def verify(): + word_size = 8 + text_loc = 0x0 + rodata_loc = 0x10000 + data_loc = 0x20000 + bss_loc = 0x30000 + stack_end = 0x50000 + rel_bin = tvm_callback_relocate_binary( + tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX + ) + print("Relocated binary section sizes") + test_tvm_callback_get_section_size(binary=rel_bin) + relf = tmp_dir.relpath("rel.bin") + with open(relf, "wb") as f: + f.write(rel_bin) + nm_proc = subprocess.Popen( + ["nm", "-C", "--defined-only", relf], stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + (out, _) = nm_proc.communicate() + symbol_entries = out.decode("utf-8").split("\n") + for entry in symbol_entries: + if len(entry) == 0: + continue + sym_loc, section, sym_name = entry.split(" ") + sym_loc = int(sym_loc, 16) + if section == "T": # text + assert sym_loc >= text_loc and sym_loc < data_loc + elif section == "D": # data + assert sym_loc >= data_loc and sym_loc < bss_loc + elif section == "B": # bss + assert sym_loc >= bss_loc + + verify() + + +def test_tvm_callback_read_binary_section(): + binary = make_binary() + + def verify(): + text_bin = tvm_callback_read_binary_section(binary, 
"text", TOOLCHAIN_PREFIX) + data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) + bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) + print("Read text section part of binary? %r" % (text_bin in binary)) + print("Read data section part of binary? %r" % (data_bin in binary)) + print("Read bss section part of binary? %r" % (bss_bin in binary)) + print() + + verify() + + +def test_tvm_callback_get_symbol_map(): + binary = make_binary() + tmp_dir = utils.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + + def verify(): + word_size = 8 + text_loc = 0x0 + rodata_loc = 0x10000 + data_loc = 0x20000 + bss_loc = 0x30000 + stack_end = 0x50000 + rel_bin = tvm_callback_relocate_binary( + tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX + ) + symbol_map = tvm_callback_get_symbol_map(rel_bin, TOOLCHAIN_PREFIX) + symbols = set() + for i, line in enumerate(symbol_map.split("\n")): + # Every other line is the value the symbol maps to. + if i % 2 == 0: + symbols.add(line) + assert "a" in symbols + assert "main" in symbols + + verify() + + +if __name__ == "__main__": + test_tvm_callback_get_section_size() + test_tvm_callback_relocate_binary() + test_tvm_callback_read_binary_section() + test_tvm_callback_get_symbol_map() From 148bdfc6f4426800ce9c3bb9e913be92502f7c11 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 12:15:57 -0800 Subject: [PATCH 44/60] templatize LLVM param codegen --- src/target/llvm/codegen_params.cc | 130 +++++++++++++++++------------- 1 file changed, 75 insertions(+), 55 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 8f92e4f19b3a..c27a97d2f611 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -37,6 +37,40 @@ class DLManagedTensorDeleter { }; } // namespace +template +struct LLVMConstantGetter { + static llvm::Constant* getElement(llvm::Type* ty, T t); +}; + +template +struct LLVMConstantGetter::value && std::is_signed::value)>> { + static llvm::Constant* getElement(llvm::Type* ty, T t) { + return llvm::ConstantInt::getSigned(ty, t); + } +}; + +template +struct LLVMConstantGetter::value && !std::is_signed::value)>> { + static llvm::Constant* getElement(llvm::Type* ty, T t) { + return llvm::ConstantInt::get(ty, t); + } +}; + +template +struct LLVMConstantGetter::value>> { + static llvm::Constant* getElement(llvm::Type* ty, T t) { + return llvm::ConstantFP::get(ty, t); + } +}; + +template ::value>> +void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_elements, std::vector* elements) { + for (size_t i = 0; i < num_elements; i++) { + auto llvm_element = LLVMConstantGetter::getElement(element_type, static_cast(tensor_data)[i]); + elements->emplace_back(llvm_element); + } +} + llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { llvm::Type* element_type = nullptr; @@ -61,28 +95,22 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: << arr_type.bits() << "-bit array"; element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); - if (arr_type.bits() == 8) { - int8_t* data_buf = static_cast(tensor->dl_tensor.data); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::getSigned(element_type, data_buf[i])); - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - 
elements.emplace_back(llvm::ConstantInt::getSigned( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::getSigned( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::getSigned( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else { - CHECK(false) << "should not get here"; + switch (arr_type.bits()) { + case 8: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 16: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 32: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 64: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + default: + ICHECK(false) << "should not get here"; + break; } break; @@ -93,47 +121,39 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: << arr_type.bits() << "-bit array"; element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantInt::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else { - CHECK(false) << "should not get here"; + switch (arr_type.bits()) { + case 8: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 16: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 32: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 64: + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + default: + ICHECK(false) << "should not get here"; + break; } break; case runtime::DataType::TypeCode::kFloat: - if (arr_type.bits() == 32) { + switch (arr_type.bits()) { + case 32: element_type = llvm::Type::getFloatTy(*ctx); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantFP::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else if (arr_type.bits() == 64) { + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + case 64: element_type = llvm::Type::getDoubleTy(*ctx); - for (int i = 0; i < num_elements; i++) { - elements.emplace_back(llvm::ConstantFP::get( - element_type, reinterpret_cast(tensor->dl_tensor.data)[i])); - } - } else { + BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + break; + default: CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " << arr_type.bits() << "-bit array"; + break; } break; From 
38a73ea165e100b7d4fb03a34fb9884b2ae70c10 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 13:17:45 -0800 Subject: [PATCH 45/60] address tqchen comments --- include/tvm/tir/function.h | 13 ++- pyproject.toml | 92 ++++++++++++++++++++++ src/relay/backend/graph_runtime_codegen.cc | 10 ++- src/runtime/graph/graph_runtime.cc | 20 +++-- src/runtime/graph/graph_runtime.h | 13 ++- src/runtime/rpc/rpc_module.cc | 21 +++-- src/target/llvm/codegen_llvm.cc | 52 +++++------- src/target/llvm/codegen_params.cc | 23 +++--- 8 files changed, 174 insertions(+), 70 deletions(-) diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index a22552ea190c..97ee7f7211d4 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -151,6 +151,14 @@ class PrimFunc : public BaseFunc { TVM_DEFINE_OBJECT_REF_COW_METHOD(PrimFuncNode); }; +/*! + * \brief Describes one parameter that should be linked into the generated module. + * + * When parameters are to be linked in with generated code (i.e. on target_host-compatible + * backends), Relay attaches instances of this object to a global TIR function. Code-generators + * use the information contained in this node to include the parameter data in the generated + * module. + */ class LinkedParamNode : public Object { public: /*! \brief Unique numeric identifier used by runtimes to lookup this parameter. */ @@ -168,9 +176,12 @@ class LinkedParamNode : public Object { TVM_DECLARE_FINAL_OBJECT_INFO(LinkedParamNode, Object); }; +/*! + * \brief Managed reference to LinkedParamNode. + */ class LinkedParam : public ObjectRef { public: - LinkedParam(int64_t id, ::tvm::runtime::NDArray param); + TVM_DLL LinkedParam(int64_t id, ::tvm::runtime::NDArray param); TVM_DEFINE_OBJECT_REF_METHODS(LinkedParam, ObjectRef, LinkedParamNode); TVM_DEFINE_OBJECT_REF_COW_METHOD(LinkedParamNode); diff --git a/pyproject.toml b/pyproject.toml index 5cca711ddbe6..d273b25eb3cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,3 +46,95 @@ exclude = ''' )/ ) ''' +[tool.poetry] +name = "incubator-tvm" +version = "0.1.0" +description = "" +authors = ["Your Name "] +packages = [ + { include = "tvm", from = "../../../../python" }, +] + +[tool.poetry.dependencies] +attrs = "^19" +decorator = "^4.4" +numpy = "~1.19" +psutil = "^5" +scipy = "^1.4" +python = "^3.6" +tornado = "^6" +typed_ast = "^1.4" + +# AutoTVM +xgboost = {version = "^1.1", optional = true} + +############# +# Importers # +############# + +# NOTE: Caffe frontend dependency is from torch package. + +# CoreML +coremltools = {version = "^3.3", optional = true} + +# Darknet +opencv-python = {version = "^4.2", optional = true} +cffi = {version = "^1.14", optional = true} + +# NOTE: Keras provided by tensorflow package. +# If TF version conflict, maybe try: keras = "2.3.1" + +# MXNet frontend +mxnet = {version = "^1.6.0", optional = true} + +# ONNX frontend +onnx = {version = "1.6.0", optional = true} +onnxruntime = {version = "1.0.0", optional = true} + +# Pytorch (also used by ONNX) +torch = {version = "1.4.0", optional = true} +torchvision = {version = "0.5.0", optional = true} +# NOTE: torch depends on a number of other packages, but unhelpfully, does not expose that in the +# wheel!!! 
+future = {version = "*", optional = true} + +# Tensorflow frontend +tensorflow = {version = "^2.1", optional = true} +tensorflow-estimator = {version = "^2.1", optional = true} + +# TFLite frontend +tflite = {version = "2.1.0", optional = true} +wheel = "*" + + +[tool.poetry.extras] +xgboost = ["xgboost"] +importer-caffe2 = ["torch"] +importer-coreml = ["coremltools"] +importer-darknet = ["opencv-python"] +importer-keras = ["tensorflow", "tensorflow-estimator"] +importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"] +importer-pytorch = ["torch", "torchvision", "future"] +importer-tensorflow = ["tensorflow", "tensorflow-estimator"] +importer-tflite = ["tlfite", "tensorflow", "tensorflow-estimator"] + +[tool.poetry.dev-dependencies] +autodocsumm = "^0.1" +black = "^19.10b0" +sphinx = "^3.0" +sphinx-gallery = "^0.4" +sphinx-rtd-theme = "^0.4" +matplotlib = "^3.2" +Image = "^1.5" +recommonmark = "^0.6" +pillow = "< 7" +pyformat = "^0.7" +pylint = "^2.4" +pytest = "^5.4" + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" + +[tool.autopep8] +max_line_length = 100 diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index f35f144181c6..93439ba04f2d 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -595,14 +595,16 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { } else if (name == "get_param_by_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; - ICHECK_GT(this->output_.params.count(key), 0); - *rv = this->output_.params[key].second; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.second; }); } else if (name == "get_param_id") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; - ICHECK_GT(this->output_.params.count(key), 0); - *rv = this->output_.params[key].first; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.first; }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 293de2276621..6d08019a0275 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -75,7 +75,9 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu ctxs_ = ctxs; lookup_linked_param_ = lookup_linked_param_func; if (lookup_linked_param_ == nullptr) { - lookup_linked_param_ = PackedFunc(&GraphRuntime::DefaultLookupLinkedParam); + lookup_linked_param_ = PackedFunc([this](TVMArgs args, TVMRetValue* rv) { + this->DefaultLookupLinkedParam(args, rv); + }); } this->SetupStorage(); this->SetupOpExecs(); @@ -249,9 +251,10 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } -void GraphRuntime::PreAllocatedDLTensorDeleter(DLManagedTensor* tensor) { - // ctx is the DLTensor which needs to get deleted. The data member points to global const memory. - delete reinterpret_cast(tensor); +void GraphRuntime::LinkedNDArrayDeleter(Object* container) { + // container is the NDArray::Container which needs to get deleted. + // The data member points to global const memory, so it does not need deleting. 
+ delete reinterpret_cast(container); } void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { @@ -261,14 +264,16 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { TVMContext ctx = args[3]; // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. - tvm::runtime::PackedFunc pf = + if (!module_lookup_linked_param_valid_) { + module_lookup_linked_param_ = mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); - if (pf == nullptr) { + } + if (module_lookup_linked_param_ == nullptr) { *rv = nullptr; return; } - TVMRetValue opaque_handle = pf(storage_id); + TVMRetValue opaque_handle = module_lookup_linked_param_(storage_id); if (opaque_handle.type_code() == kTVMNullptr) { *rv = nullptr; return; @@ -279,6 +284,7 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { std::unique_ptr container{new NDArray::Container( static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; + container->SetDeleter(GraphRuntime::LinkedNDArrayDeleter); *rv = NDArray(GetObjectPtr(container.release())); } diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index f1894c4830d0..627911883dfb 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -370,9 +370,9 @@ class TVM_DLL GraphRuntime : public ModuleNode { ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } /*! \brief PackedFunc to lookup a linked paramter from a local Module. */ - static void DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv); - /*! \brief Delete pre-allocated DLTensor. */ - static void PreAllocatedDLTensorDeleter(DLManagedTensor* tensor); + void DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv); + /*! \brief Delete NDArray::Container with linked (i.e. static) data. */ + static void LinkedNDArrayDeleter(Object* container); /*! \brief Setup the temporal storage */ void SetupStorage(); /*! \brief Setup the executors. */ @@ -420,6 +420,13 @@ class TVM_DLL GraphRuntime : public ModuleNode { std::vector> op_execs_; /*! \brief Linked parameter lookup function. */ PackedFunc lookup_linked_param_; + /*! \brief Module's _lookup_linked_param function, used by DefaultLookupLinkedParam. */ + PackedFunc module_lookup_linked_param_; + /*! + * \brief True when module_lookup_linked_param_ is valid. + * When the module does not include linked parmeters, module_lookup_linked_param_ will be nullptr. + */ + bool module_lookup_linked_param_valid_; }; std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 12510e0fac3a..cb115b4a1def 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -41,7 +41,9 @@ namespace runtime { static void RemoteNDArrayDeleter(Object* obj) { auto* ptr = static_cast(obj); RemoteSpace* space = static_cast(ptr->dl_tensor.data); - space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle); + if (ptr->manager_ctx != nullptr) { + space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle); + } delete space; delete ptr; } @@ -54,13 +56,11 @@ static void RemoteNDArrayDeleter(Object* obj) { * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly * created array. Needed because it's difficult to pass a shape vector as a PackedFunc arg. * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask. 
- * \param deleter A function invoked when the local NDArray object is no longer used. If `handle` - * needs to be explicitly deleted after the NDArray is freed, this function should do that. - * \param deleter_ctx An opaque pointer passed to deleter to identify the tensor being deleted. + * \param remote_ndarray_handle The handle returned by RPC server to identify the NDArray. */ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, DLTensor* template_tensor, TVMContext ctx, - ADTObj::FDeleter deleter, void* deleter_ctx) { + void* remote_ndarray_handle) { ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) << "The TVMContext given does not belong to the given session"; RemoteSpace* space = new RemoteSpace(); @@ -70,8 +70,8 @@ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* ha template_tensor->shape + template_tensor->ndim}; NDArray::Container* data = new NDArray::Container(static_cast(space), std::move(shape_vec), template_tensor->dtype, ctx); - data->manager_ctx = deleter_ctx; - data->SetDeleter(deleter); + data->manager_ctx = remote_ndarray_handle; + data->SetDeleter(RemoteNDArrayDeleter); return NDArray(GetObjectPtr(data)); } @@ -286,7 +286,7 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons void* nd_handle = args[2]; *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, AddRPCSessionMask(tensor->ctx, sess_->table_index()), - RemoteNDArrayDeleter, nd_handle); + nd_handle); } else { ICHECK_EQ(args.size(), 2); *rv = args[1]; @@ -474,10 +474,9 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, - PackedFunc deleter) -> NDArray { + void* ndarray_handle) -> NDArray { return NDArrayFromRemoteOpaqueHandle( - RPCModuleGetSession(mod), remote_array, template_tensor, ctx, [](Object* context) {}, - nullptr); + RPCModuleGetSession(mod), remote_array, template_tensor, ctx, ndarray_handle); }); } // namespace runtime diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 11da661cceac..0fc36d96747f 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -205,9 +205,6 @@ void CodeGenLLVM::LinkParameters(const Map params) { // resource_handle param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace())); - // TODO(tvm-team): - // Update the function type to respect the ret_type field of f. - // Once we allow more flexibility in the PrimFunc. 
llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, param_types, false); llvm::Function* function = @@ -234,25 +231,28 @@ void CodeGenLLVM::LinkParameters(const Map params) { t_int64_); llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function); - llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); - - builder_->SetInsertPoint(default_block); - { - auto ret_types_array = builder_->CreateBitCast( + auto ret_types_array = builder_->CreateBitCast( #if TVM_LLVM_VERSION >= 50 - &function->arg_begin()[4], + &function->arg_begin()[4], #else - &(*(std::next(function->arg_begin(), 4))), + &(*(std::next(function->arg_begin(), 4))), #endif - llvm::ArrayType::get(t_int_, 1)->getPointerTo()); - - builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr), - builder_->CreateGEP(ret_types_array, zero_array_index_list)); - builder_->CreateRet(ConstInt32(kTvmErrorNoError)); - } + llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + auto retval_array = builder_->CreateBitCast( +#if TVM_LLVM_VERSION >= 50 + &function->arg_begin()[3], +#else + &(*std::next(function->arg_begin(), 3)), +#endif + llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo()); + llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); - llvm::raw_os_ostream os{std::cout}; + builder_->SetInsertPoint(default_block); + builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr), + builder_->CreateInBoundsGEP(ret_types_array, zero_array_index_list)); + builder_->CreateRet(ConstInt32(kTvmErrorNoError)); + // Add data to the global section. for (auto kv : params) { auto array = NDArrayToLLVMArray(ctx_, kv.second->param); std::string symbol_name = std::string(::tvm::runtime::symbol::tvm_param_prefix) + kv.first; @@ -263,25 +263,11 @@ void CodeGenLLVM::LinkParameters(const Map params) { switch_inst->addCase( llvm::cast(llvm::ConstantInt::get(t_int64_, kv.second->id)), case_block); builder_->SetInsertPoint(case_block); - auto retval_array = builder_->CreateBitCast( -#if TVM_LLVM_VERSION >= 50 - &function->arg_begin()[3], -#else - &(*std::next(function->arg_begin(), 3)), -#endif - llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo()); builder_->CreateStore( builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())), - builder_->CreateGEP(retval_array, zero_array_index_list)); - auto ret_types_array = builder_->CreateBitCast( -#if TVM_LLVM_VERSION >= 50 - &function->arg_begin()[4], -#else - &(*std::next(function->arg_begin(), 4)), -#endif - llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + builder_->CreateInBoundsGEP(retval_array, zero_array_index_list)); builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle), - builder_->CreateGEP(ret_types_array, zero_array_index_list)); + builder_->CreateInBoundsGEP(ret_types_array, zero_array_index_list)); builder_->CreateRet(ConstInt32(0)); } } diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index c27a97d2f611..8a675efeded1 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -75,6 +75,8 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: llvm::Type* element_type = nullptr; auto arr_type = arr.DataType(); + CHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays"; + CHECK_EQ(arr->ctx.device_type, kDLCPU) << "CodegenParams: only support 
contiguous arrays"; CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); @@ -84,7 +86,6 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: num_elements *= shape_elem; } - std::unique_ptr tensor(arr.ToDLPack()); std::vector elements; switch (arr_type.code()) { @@ -97,16 +98,16 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: switch (arr_type.bits()) { case 8: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 16: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 32: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 64: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; default: ICHECK(false) << "should not get here"; @@ -123,16 +124,16 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: switch (arr_type.bits()) { case 8: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 16: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 32: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 64: - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; default: ICHECK(false) << "should not get here"; @@ -144,11 +145,11 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: switch (arr_type.bits()) { case 32: element_type = llvm::Type::getFloatTy(*ctx); - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; case 64: element_type = llvm::Type::getDoubleTy(*ctx); - BuildLLVMVector(element_type, tensor->dl_tensor.data, num_elements, &elements); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); break; default: CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " From 0fc11f1a9c362006ba4f85a17a1fdcdd778346d0 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 13:18:34 -0800 Subject: [PATCH 46/60] git-clang-format --- src/runtime/graph/graph_runtime.cc | 7 +- src/runtime/rpc/rpc_module.cc | 4 +- src/target/llvm/codegen_llvm.cc | 14 ++-- src/target/llvm/codegen_params.cc | 104 ++++++++++++++--------------- 4 files changed, 64 insertions(+), 65 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 6d08019a0275..38815396b5e5 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -75,9 +75,8 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu ctxs_ = ctxs; lookup_linked_param_ = lookup_linked_param_func; if (lookup_linked_param_ == nullptr) { - lookup_linked_param_ = 
PackedFunc([this](TVMArgs args, TVMRetValue* rv) { - this->DefaultLookupLinkedParam(args, rv); - }); + lookup_linked_param_ = PackedFunc( + [this](TVMArgs args, TVMRetValue* rv) { this->DefaultLookupLinkedParam(args, rv); }); } this->SetupStorage(); this->SetupOpExecs(); @@ -266,7 +265,7 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { // params are present. if (!module_lookup_linked_param_valid_) { module_lookup_linked_param_ = - mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); + mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); } if (module_lookup_linked_param_ == nullptr) { *rv = nullptr; diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index cb115b4a1def..4f721e122a4c 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -475,8 +475,8 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, void* ndarray_handle) -> NDArray { - return NDArrayFromRemoteOpaqueHandle( - RPCModuleGetSession(mod), remote_array, template_tensor, ctx, ndarray_handle); + return NDArrayFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, template_tensor, + ctx, ndarray_handle); }); } // namespace runtime diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 0fc36d96747f..d10ed311949c 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -233,18 +233,18 @@ void CodeGenLLVM::LinkParameters(const Map params) { llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function); auto ret_types_array = builder_->CreateBitCast( #if TVM_LLVM_VERSION >= 50 - &function->arg_begin()[4], + &function->arg_begin()[4], #else - &(*(std::next(function->arg_begin(), 4))), + &(*(std::next(function->arg_begin(), 4))), #endif - llvm::ArrayType::get(t_int_, 1)->getPointerTo()); - auto retval_array = builder_->CreateBitCast( + llvm::ArrayType::get(t_int_, 1)->getPointerTo()); + auto retval_array = builder_->CreateBitCast( #if TVM_LLVM_VERSION >= 50 - &function->arg_begin()[3], + &function->arg_begin()[3], #else - &(*std::next(function->arg_begin(), 3)), + &(*std::next(function->arg_begin(), 3)), #endif - llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo()); + llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo()); llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1); builder_->SetInsertPoint(default_block); diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 8a675efeded1..9588f876abed 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -43,30 +43,30 @@ struct LLVMConstantGetter { }; template -struct LLVMConstantGetter::value && std::is_signed::value)>> { +struct LLVMConstantGetter< + T, std::enable_if_t<(std::is_integral::value && std::is_signed::value)>> { static llvm::Constant* getElement(llvm::Type* ty, T t) { return llvm::ConstantInt::getSigned(ty, t); } }; template -struct LLVMConstantGetter::value && !std::is_signed::value)>> { - static llvm::Constant* getElement(llvm::Type* ty, T t) { - return llvm::ConstantInt::get(ty, t); - } +struct LLVMConstantGetter< + T, std::enable_if_t<(std::is_integral::value && !std::is_signed::value)>> { + static 
llvm::Constant* getElement(llvm::Type* ty, T t) { return llvm::ConstantInt::get(ty, t); } }; template struct LLVMConstantGetter::value>> { - static llvm::Constant* getElement(llvm::Type* ty, T t) { - return llvm::ConstantFP::get(ty, t); - } + static llvm::Constant* getElement(llvm::Type* ty, T t) { return llvm::ConstantFP::get(ty, t); } }; template ::value>> -void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_elements, std::vector* elements) { +void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_elements, + std::vector* elements) { for (size_t i = 0; i < num_elements; i++) { - auto llvm_element = LLVMConstantGetter::getElement(element_type, static_cast(tensor_data)[i]); + auto llvm_element = + LLVMConstantGetter::getElement(element_type, static_cast(tensor_data)[i]); elements->emplace_back(llvm_element); } } @@ -97,21 +97,21 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); switch (arr_type.bits()) { - case 8: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 16: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 32: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 64: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - default: - ICHECK(false) << "should not get here"; - break; + case 8: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 16: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 32: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 64: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + default: + ICHECK(false) << "should not get here"; + break; } break; @@ -123,38 +123,38 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); switch (arr_type.bits()) { - case 8: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 16: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 32: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 64: - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - default: - ICHECK(false) << "should not get here"; - break; + case 8: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 16: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 32: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + case 64: + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + default: + ICHECK(false) << "should not get here"; + break; } break; case runtime::DataType::TypeCode::kFloat: switch (arr_type.bits()) { - case 32: - element_type = llvm::Type::getFloatTy(*ctx); - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - case 64: - element_type = llvm::Type::getDoubleTy(*ctx); - BuildLLVMVector(element_type, arr->data, num_elements, &elements); - break; - default: - CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " - << arr_type.bits() << "-bit array"; - break; + case 32: + element_type = llvm::Type::getFloatTy(*ctx); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + 
break; + case 64: + element_type = llvm::Type::getDoubleTy(*ctx); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; + default: + CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " + << arr_type.bits() << "-bit array"; + break; } break; From 5bf324695af48868f162d159853465f26b740419 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 18:30:01 -0800 Subject: [PATCH 47/60] actually use storage_id, not graph node id, for param id --- src/relay/backend/graph_runtime_codegen.cc | 2 +- tests/python/unittest/test_link_params.py | 87 +++++++++++----------- 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index 93439ba04f2d..7ed150495104 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -320,7 +320,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslatorvalue; params_[name] = op->data; return to_return; } diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 4b6692d79d10..65316aad866d 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -31,7 +31,10 @@ from tvm.contrib import utils -TEST_SHAPE = (3, 4, 5) +INPUT_SHAPE = (1, 3, 16, 16) + + +KERNEL_SHAPE = (3, 3, 3, 3) # The data types that are linkable. @@ -55,23 +58,22 @@ def dtype_info(dtype): RANDOM_TENSOR_START = None -def _make_random_tensor(dtype): - """Create a random test tensor of shape TEST_SHAPE and the given dtype.""" +def _make_random_tensor(dtype, shape): + """Create a random test tensor with given shape and dtype.""" global RAND_SEED if RANDOM_TENSOR_START is not None: to_return = np.arange( - RANDOM_TENSOR_START, RANDOM_TENSOR_START + np.prod(TEST_SHAPE), dtype=dtype - ).reshape(TEST_SHAPE) - RAND_SEED += np.prod(TEST_SHAPE) + RANDOM_TENSOR_START, RANDOM_TENSOR_START + np.prod(shape), dtype=dtype + ).reshape(shape) + RAND_SEED += np.prod(shape) return to_return dinfo = dtype_info(dtype) if "int" in dtype: - return np.random.randint(dinfo.min, dinfo.max, TEST_SHAPE, dtype=dtype) + return np.random.randint(dinfo.min, dinfo.max, shape, dtype=dtype) else: - to_return = np.random.uniform(0, dinfo.max, TEST_SHAPE) - # to_return = dinfo.min + (np.random.random(TEST_SHAPE) * dinfo.max) - np.reshape(to_return, np.prod(TEST_SHAPE))[::2] *= -1 + to_return = np.random.uniform(0, dinfo.max, shape).astype(dtype) + np.reshape(to_return, np.prod(shape))[::2] *= -1 return to_return @@ -94,10 +96,11 @@ def _lookup_sid(graph, name): num_outputs_seen = 0 for i, n in enumerate(graph["nodes"]): if n["name"] == name: + print('sid', name, graph["attrs"]["storage_id"][1], num_outputs_seen) return graph["attrs"]["storage_id"][1][num_outputs_seen] else: if "attrs" in n and "num_outputs" in n["attrs"]: - num_outputs_seen += n["attrs"]["num_outputs"] + num_outputs_seen += int(n["attrs"]["num_outputs"]) else: num_outputs_seen += 1 @@ -122,15 +125,14 @@ def _verify_linked_param(dtype, lib, mod, graph, name): # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend), # a GraphRuntimeFactory module is created instead of the module itself. 
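Patch 47's fix turns on the distinction between a parameter's node index in the
graph JSON and its storage id. The sketch below mirrors the test's _lookup_sid
helper on a small hand-written graph dict; the node names and ids are invented
for illustration, not real tvm.relay.build output.

# Map a parameter name to its storage id by walking the graph nodes and
# counting outputs, then indexing the flattened storage_id list.
graph = {
    "nodes": [
        {"name": "rand_input", "attrs": {"num_outputs": "1"}},
        {"name": "p0", "attrs": {"num_outputs": "1"}},
        {"name": "fused_nn_conv2d", "attrs": {"num_outputs": "1"}},
    ],
    "attrs": {"storage_id": ["list_int", [0, 2, 1]]},
}

def lookup_sid(graph, name):
    outputs_seen = 0
    for node in graph["nodes"]:
        if node["name"] == name:
            return graph["attrs"]["storage_id"][1][outputs_seen]
        # num_outputs is serialized as a string in the graph JSON.
        outputs_seen += int(node.get("attrs", {}).get("num_outputs", 1))
    raise KeyError(name)

assert lookup_sid(graph, "p0") == 2  # the storage id, not the node index (1)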
param_ptr = mod.get_function("_lookup_linked_param", True)(sid) - print("verify", param_ptr) - arr_data = (_get_ctypes_dtype(dtype) * np.prod(TEST_SHAPE)).from_address(param_ptr.value) gen_param = lib.params[name] - print("gen param dtype", gen_param.dtype) + arr_data = (_get_ctypes_dtype(dtype) * np.prod(gen_param.shape)).from_address(param_ptr.value) arr = np.ndarray(shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order="C") if "int" in gen_param.dtype: np.testing.assert_equal(gen_param.asnumpy(), arr) else: np.testing.assert_allclose(gen_param.asnumpy(), arr) + return dtype == gen_param.dtype def _make_mod_and_params(dtype): @@ -139,27 +141,31 @@ def _make_mod_and_params(dtype): param_init = {} def _add_decl(name, dtype): - param_decls[name] = f"%{name} : Tensor[{TEST_SHAPE}, {dtype}]" - param_init[name] = _make_random_tensor(dtype) + param_decls[name] = f"%{name} : Tensor[{KERNEL_SHAPE}, {dtype}]" + param_init[name] = _make_random_tensor(dtype, KERNEL_SHAPE) + # Add several parameters so that the number of parameters _add_decl(f"{dtype}_a", dtype) _add_decl(f"{dtype}_b", dtype) mod_lines = [ '#[version = "0.0.5"]', - f"def @main(%rand_input : Tensor[{TEST_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", + f"def @main(%rand_input : Tensor[{INPUT_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", + # This program ensures that GraphPlanMemory alternates between the same two storage IDs for a + # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an + # index unequal to its storage_id. This ensures that GraphRuntimeCodegen encodes the storage_id + # and not the parameter index into the graph. + (f' %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");'), + (f' %1 = nn.conv2d(%0, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");'), + (f' %2 = nn.conv2d(%1, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");'), + (f' %3 = nn.conv2d(%2, %{dtype}_b, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");'), + " %3", + "}", ] - if "int" in dtype: - mod_lines.append( - # f' %0 = bitwise_xor(%rand_input, bitwise_xor(%{dtype}_a, %{dtype}_b));') - f" %0 = add(%rand_input, %{dtype}_a);" - ) - else: - mod_lines.append( - f' %0 = cast(add(%rand_input, cast(add(%{dtype}_a, %{dtype}_b), dtype="{dtype}")), dtype="{dtype}");' - ) - # f' %0 = cast(add(%rand_input, %{dtype}_a), dtype="{dtype}");') - mod_lines.extend([" %0", "}"]) mod = tvm.parser.fromtext("\n".join(mod_lines)) return mod, param_init @@ -169,16 +175,17 @@ def _add_decl(name, dtype): def test_llvm_link_params(): for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) - rand_input = _make_random_tensor(dtype) + rand_input = _make_random_tensor(dtype, INPUT_SHAPE) main_func = mod["main"] target = "llvm --runtime=c --system-lib --link-params" with tvm.transform.PassContext(opt_level=3): lib = tvm.relay.build(mod, target, params=param_init) - assert set(lib.params.keys()) == {"p0"} # NOTE: op folded + assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded + print('graph', lib.graph_json) graph = json.loads(lib.graph_json) for p in lib.params: - _verify_linked_param(dtype, lib, lib.lib, graph, p) + _verify_linked_param(dtype, lib, lib.lib, graph, p) or found_one # Wrap in function to explicitly deallocate the runtime. 
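The verification helper above turns the raw address handed back by
_lookup_linked_param into a numpy view via ctypes. The snippet below shows the
same trick on an ordinary numpy buffer standing in for a linked parameter; the
shape and dtype are arbitrary.

import ctypes
import numpy as np

# Stand-in for a parameter constant living at a fixed address in the binary.
param = np.arange(27, dtype="int16").reshape(3, 3, 3)
param_ptr = param.ctypes.data  # bare integer address, like param_ptr.value

# Rebuild an array view from the address alone, as _verify_linked_param does.
buf = (ctypes.c_int16 * param.size).from_address(param_ptr)
view = np.ndarray(shape=param.shape, dtype=param.dtype, buffer=buf, order="C")

np.testing.assert_equal(param, view)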
def _run_linked(lib): @@ -244,18 +251,18 @@ def test_c_link_params(): temp_dir = utils.tempdir() for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) - rand_input = _make_random_tensor(dtype) + rand_input = _make_random_tensor(dtype, INPUT_SHAPE) main_func = mod["main"] target = "c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lib = tvm.relay.build(mod, target, params=param_init) - assert set(lib.params.keys()) == {"p0"} # NOTE: op folded + assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded src = lib.lib.get_source() lib.lib.save("test.c", "cc") c_dtype = _get_c_datatype(dtype) src_lines = src.split("\n") - param = lib.params["p0"].asnumpy().reshape(np.prod(TEST_SHAPE)) + param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE)) param_def = f"static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{" for i, line in enumerate(src_lines): if line == param_def: @@ -269,7 +276,6 @@ def test_c_link_params(): if dtype.startswith("int"): width += 1 # Account for sign - print("check printing of", param) while "};" not in src_lines[i]: for match in HEX_NUM_RE.finditer(src_lines[i]): assert match.group() == _format_c_value(dtype, width, param[cursor]), ( @@ -296,7 +302,6 @@ def test_c_link_params(): def _run_linked(lib_mod): graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. - print("linked", graph_rt.get_input("p0")) graph_rt.run() return graph_rt.get_output(0) @@ -312,8 +317,6 @@ def _run_linked(lib_mod): lib.export_library(lib_path) lib_mod = tvm.runtime.load_module(lib_path) - print("unlinked", params) - def _run_unlinked(lib_mod): graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input, **params) @@ -334,12 +337,12 @@ def test_crt_link_params(): for dtype in LINKABLE_DTYPES: mod, param_init = _make_mod_and_params(dtype) - rand_input = _make_random_tensor(dtype) + rand_input = _make_random_tensor(dtype, INPUT_SHAPE) main_func = mod["main"] target = "c -mcpu=native --system-lib --runtime=c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): graph_json, lib, params = tvm.relay.build(mod, target, params=param_init) - assert set(params.keys()) == {"p0"} # NOTE: op folded + assert set(params.keys()) == {"p0", "p1"} # NOTE: op folded workspace = tvm.micro.Workspace() compiler = tvm.micro.DefaultCompiler(target=target) @@ -383,9 +386,9 @@ def _run_unlinked(lib): graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() - return graph_rt.get_output(0) + return graph_rt.get_output(0).asnumpy() - unlinked_output = _run_unlinked(lib).asnumpy() + unlinked_output = _run_unlinked(lib) if "int" in dtype: np.testing.assert_equal(unlinked_output, linked_output) From 953fb98eba35bf8d109adc13278026299c80d8c0 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 18:30:40 -0800 Subject: [PATCH 48/60] demote log level --- src/runtime/graph/graph_runtime.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 38815396b5e5..e51998574eb2 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -346,10 +346,10 @@ void GraphRuntime::SetupStorage() { }); TVMContext 
ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; if (pit.linked_param.defined()) { - LOG(INFO) << "param " << storage_pool_.size() << " pre-loaded!"; + LOG(DEBUG) << "param " << storage_pool_.size() << " pre-loaded!"; storage_pool_.push_back(pit.linked_param); } else { - LOG(INFO) << "param " << storage_pool_.size() << " blank!"; + LOG(DEBUG) << "param " << storage_pool_.size() << " blank!"; std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); From 03b79e9c295c13069b33edb58438f879acdcdde8 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 18:31:03 -0800 Subject: [PATCH 49/60] black format --- tests/python/unittest/test_link_params.py | 28 +++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 65316aad866d..7b6910b0ea57 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -96,7 +96,7 @@ def _lookup_sid(graph, name): num_outputs_seen = 0 for i, n in enumerate(graph["nodes"]): if n["name"] == name: - print('sid', name, graph["attrs"]["storage_id"][1], num_outputs_seen) + print("sid", name, graph["attrs"]["storage_id"][1], num_outputs_seen) return graph["attrs"]["storage_id"][1][num_outputs_seen] else: if "attrs" in n and "num_outputs" in n["attrs"]: @@ -155,14 +155,22 @@ def _add_decl(name, dtype): # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an # index unequal to its storage_id. This ensures that GraphRuntimeCodegen encodes the storage_id # and not the parameter index into the graph. - (f' %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' - f'kernel_size=[3, 3], out_dtype="{dtype}");'), - (f' %1 = nn.conv2d(%0, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' - f'kernel_size=[3, 3], out_dtype="{dtype}");'), - (f' %2 = nn.conv2d(%1, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' - f'kernel_size=[3, 3], out_dtype="{dtype}");'), - (f' %3 = nn.conv2d(%2, %{dtype}_b, data_layout="NCHW", kernel_layout="OIHW", ' - f'kernel_size=[3, 3], out_dtype="{dtype}");'), + ( + f' %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");' + ), + ( + f' %1 = nn.conv2d(%0, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");' + ), + ( + f' %2 = nn.conv2d(%1, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");' + ), + ( + f' %3 = nn.conv2d(%2, %{dtype}_b, data_layout="NCHW", kernel_layout="OIHW", ' + f'kernel_size=[3, 3], out_dtype="{dtype}");' + ), " %3", "}", ] @@ -182,7 +190,7 @@ def test_llvm_link_params(): lib = tvm.relay.build(mod, target, params=param_init) assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded - print('graph', lib.graph_json) + print("graph", lib.graph_json) graph = json.loads(lib.graph_json) for p in lib.params: _verify_linked_param(dtype, lib, lib.lib, graph, p) or found_one From 7a3a9dd16953bf0a9eb4fedb9007a8b41a127d88 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sun, 22 Nov 2020 19:05:16 -0800 Subject: [PATCH 50/60] rm debug logs --- src/runtime/graph/graph_runtime.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index e51998574eb2..0033a1d5d8d2 100644 
--- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -346,10 +346,8 @@ void GraphRuntime::SetupStorage() { }); TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; if (pit.linked_param.defined()) { - LOG(DEBUG) << "param " << storage_pool_.size() << " pre-loaded!"; storage_pool_.push_back(pit.linked_param); } else { - LOG(DEBUG) << "param " << storage_pool_.size() << " blank!"; std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); From e4296eff29f929cc7f3343eff21bc1e8bc26eb9f Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 15:10:43 -0800 Subject: [PATCH 51/60] address kparzysz comments --- src/target/llvm/codegen_params.cc | 8 +- src/target/source/codegen_params.cc | 230 +++++++++------------------- 2 files changed, 72 insertions(+), 166 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 9588f876abed..a5ddb02ac35b 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -64,11 +64,9 @@ struct LLVMConstantGetter::value>> template ::value>> void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_elements, std::vector* elements) { - for (size_t i = 0; i < num_elements; i++) { - auto llvm_element = - LLVMConstantGetter::getElement(element_type, static_cast(tensor_data)[i]); - elements->emplace_back(llvm_element); - } + elements->resize(num_elements, nullptr); + std::transform(static_cast(tensor_data), static_cast(tensor_data) + num_elements, + elements->begin(), [&](T t) { return LLVMConstantGetter::getElement(element_type, t); }); } llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index 74524b3545d3..80d131e7360d 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -42,6 +42,62 @@ class DLManagedTensorDeleter { static constexpr const int kMaxLineLength = 80; +template ::value>> +void PrintArray(void* data, size_t num_elements, int elements_per_row, std::string indent_str, + std::ostream& os) { + for (size_t i = 0; i < num_elements; i++) { + int64_t elem = static_cast(data)[i]; + if (std::is_signed::value) { + uint64_t to_print; + if (elem < 0) { + os << "-"; + to_print = -elem; + } else { + os << "+"; + to_print = elem; + } + os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast(to_print); + } else { + os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast(elem); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } +} + +template ::value>> +void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int elements_per_row, + std::string indent_str, std::ostream& os) { + std::stringstream ss; + ss.setf(std::ios::hex | (std::is_signed::value ? std::ios::showbase : 0) | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + for (int i = 0; i < num_elements; i++) { + T elem = static_cast(data)[i]; + if (std::isinf(elem)) { + // C99 standard. + os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; + } else if (std::isnan(elem)) { + // GNU extension, implemenatation-dependent. 
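Patch 51 folds the per-width integer printing loops into a single PrintArray
template: each element becomes an explicit sign plus a fixed-width hex
magnitude, and rows wrap so the generated C stays within the line-length
limit. A rough Python equivalent of that formatting (the values and row width
here are made up):

def format_int_elements(values, bits, per_row):
    pieces = []
    for i, v in enumerate(values):
        sign = "-" if v < 0 else "+"
        pieces.append("{}0x{:0{w}x}".format(sign, abs(v), w=bits // 4))
        if i < len(values) - 1:
            pieces.append(", ")
        if (i + 1) % per_row == 0:
            pieces.append("\n")
    return "".join(pieces)

print(format_int_elements([-3, -2, -1, 0, 1, 2, 3], bits=16, per_row=4))
# -0x0003, -0x0002, -0x0001, +0x0000,
# +0x0001, +0x0002, +0x0003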
+ os << std::setw(one_element_size_bytes) << "NAN"; + } else { + ss << elem; + os << std::setw(one_element_size_bytes) << ss.str(); + ss.str(""); + } + if (i < num_elements - 1) { + os << ", "; + } + if (((i + 1) % elements_per_row) == 0) { + os << "\n" << indent_str; + } + } +} + + void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { auto arr_type = arr.DataType(); CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " @@ -49,13 +105,13 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); if (arr_type.code() == runtime::DataType::TypeCode::kInt) { - one_element_size_bytes += 1; // sign bit + one_element_size_bytes += 1; // sign character if (arr_type.bits() > 32) { - one_element_size_bytes += 2; // "UL" + one_element_size_bytes += 2; // "LL" } } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { if (arr_type.bits() > 32) { - one_element_size_bytes += 1; // "L" + one_element_size_bytes += 3; // "ULL" } } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { // Floats and doubles are printed as hex but casted. @@ -89,85 +145,14 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& arr_type.bits() == 64) << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " << arr_type.bits() << "-bit array"; - if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid - // printing as a char. - int8_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(2) << +static_cast(to_print); - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - int16_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint16_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(4) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - int32_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint32_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(8) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - int64_t elem = static_cast(tensor->dl_tensor.data)[i]; - uint64_t to_print; - if (elem < 0) { - os << "-"; - to_print = -elem; - } else { - os << "+"; - to_print = elem; - } - os << "0x" << std::setw(16) << to_print; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << 
indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else { CHECK(false) << "should not get here"; } @@ -180,102 +165,25 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& << arr_type.bits() << "-bit array"; if (arr_type.bits() == 8) { - for (int i = 0; i < num_elements; i++) { - // NOTE: for special types int8_t and uint8_t, need to promote to int type to avoid - // printing as a char. - os << "0x" << std::setw(2) - << +static_cast(static_cast(tensor->dl_tensor.data)[i]); - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 16) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(4) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(8) << static_cast(tensor->dl_tensor.data)[i]; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - os << "0x" << std::setw(16) << static_cast(tensor->dl_tensor.data)[i] << "UL"; - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else { CHECK(false) << "should not get here"; } break; case runtime::DataType::TypeCode::kFloat: { - std::stringstream ss; - ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, - std::ios::basefield | std::ios::showbase | std::ios::floatfield); os.fill(' '); os.setf(std::ios::left, std::ios::adjustfield); if (arr_type.bits() == 32) { - for (int i = 0; i < num_elements; i++) { - float elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. - os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); - } - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } - std::cout << "\n"; + PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, elements_per_row, indent_str, os); } else if (arr_type.bits() == 64) { - for (int i = 0; i < num_elements; i++) { - double elem = static_cast(tensor->dl_tensor.data)[i]; - if (std::isinf(elem)) { - // C99 standard. - os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY"; - } else if (std::isnan(elem)) { - // GNU extension, implemenatation-dependent. 
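For floating point, the printer keeps values bit-exact by emitting C99
hexadecimal float literals, substituting the INFINITY macro (C99) and NAN (a
GNU extension) for non-finite values. A rough Python equivalent of the
per-element decision, using float.hex() in place of the C++ hexfloat stream
flags:

import math

def format_float_element(x):
    if math.isinf(x):
        return ("-" if x < 0 else "") + "INFINITY"
    if math.isnan(x):
        return "NAN"
    return float(x).hex()  # e.g. 1.5 -> 0x1.8000000000000p+0

for v in [1.5, -2.0, float("inf"), float("nan")]:
    print(format_float_element(v))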
- os << std::setw(one_element_size_bytes) << "NAN"; - } else { - ss << elem; - os << std::setw(one_element_size_bytes) << ss.str(); - ss.str(""); - } - if (i < num_elements - 1) { - os << ", "; - } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } - } + PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, elements_per_row, indent_str, os); } else { CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " << arr_type.bits() << "-bit array"; From a05871fa904cc545d4827ce20a697ee8afe93550 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 15:11:21 -0800 Subject: [PATCH 52/60] git-clang-format --- src/target/llvm/codegen_params.cc | 3 ++- src/target/source/codegen_params.cc | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index a5ddb02ac35b..925754c3eef4 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -66,7 +66,8 @@ void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_ele std::vector* elements) { elements->resize(num_elements, nullptr); std::transform(static_cast(tensor_data), static_cast(tensor_data) + num_elements, - elements->begin(), [&](T t) { return LLVMConstantGetter::getElement(element_type, t); }); + elements->begin(), + [&](T t) { return LLVMConstantGetter::getElement(element_type, t); }); } llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) { diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index 80d131e7360d..223073717312 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -73,7 +73,8 @@ template ::value void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int elements_per_row, std::string indent_str, std::ostream& os) { std::stringstream ss; - ss.setf(std::ios::hex | (std::is_signed::value ? std::ios::showbase : 0) | std::ios::fixed | std::ios::scientific, + ss.setf(std::ios::hex | (std::is_signed::value ? 
std::ios::showbase : 0) | std::ios::fixed | + std::ios::scientific, std::ios::basefield | std::ios::showbase | std::ios::floatfield); for (int i = 0; i < num_elements; i++) { T elem = static_cast(data)[i]; @@ -97,7 +98,6 @@ void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int } } - void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) { auto arr_type = arr.DataType(); CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " @@ -167,11 +167,14 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& if (arr_type.bits() == 8) { PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); } else if (arr_type.bits() == 16) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, + os); } else if (arr_type.bits() == 32) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, + os); } else if (arr_type.bits() == 64) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, + os); } else { CHECK(false) << "should not get here"; } @@ -181,9 +184,11 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os.fill(' '); os.setf(std::ios::left, std::ios::adjustfield); if (arr_type.bits() == 32) { - PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, elements_per_row, indent_str, os); + PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, + elements_per_row, indent_str, os); } else if (arr_type.bits() == 64) { - PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, elements_per_row, indent_str, os); + PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, + elements_per_row, indent_str, os); } else { CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " << arr_type.bits() << "-bit array"; From dd862fc549a7111165064f162e7c76d472bbf3fc Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 15:14:29 -0800 Subject: [PATCH 53/60] cpplint --- src/target/llvm/codegen_params.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 925754c3eef4..fd44117ade95 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -24,6 +24,7 @@ #include "codegen_params.h" +#include #include #include From b08e24f982eacf24817ba9bfd722e34fb75ffb1f Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 15:35:31 -0800 Subject: [PATCH 54/60] fix compile bugs on linux --- src/target/source/codegen_params.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index 223073717312..ea7f19418f20 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -73,10 +73,14 @@ template ::value void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int elements_per_row, std::string indent_str, std::ostream& os) { std::stringstream ss; - ss.setf(std::ios::hex | (std::is_signed::value ? 
std::ios::showbase : 0) | std::ios::fixed | - std::ios::scientific, - std::ios::basefield | std::ios::showbase | std::ios::floatfield); - for (int i = 0; i < num_elements; i++) { + if (std::is_signed::value) { + ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + } else { + ss.setf(std::ios::hex | std::ios::fixed | std::ios::scientific, + std::ios::basefield | std::ios::showbase | std::ios::floatfield); + } + for (size_t i = 0; i < num_elements; i++) { T elem = static_cast(data)[i]; if (std::isinf(elem)) { // C99 standard. From bcbeda4be7679c65b1b8b81b5e546100952ab447 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 20:45:47 -0800 Subject: [PATCH 55/60] revert pyproject, address tqchen, kparzysz comments --- pyproject.toml | 92 ----------------------------- src/relay/backend/build_module.cc | 1 + src/target/source/codegen_params.cc | 8 ++- 3 files changed, 7 insertions(+), 94 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d273b25eb3cd..5cca711ddbe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,95 +46,3 @@ exclude = ''' )/ ) ''' -[tool.poetry] -name = "incubator-tvm" -version = "0.1.0" -description = "" -authors = ["Your Name "] -packages = [ - { include = "tvm", from = "../../../../python" }, -] - -[tool.poetry.dependencies] -attrs = "^19" -decorator = "^4.4" -numpy = "~1.19" -psutil = "^5" -scipy = "^1.4" -python = "^3.6" -tornado = "^6" -typed_ast = "^1.4" - -# AutoTVM -xgboost = {version = "^1.1", optional = true} - -############# -# Importers # -############# - -# NOTE: Caffe frontend dependency is from torch package. - -# CoreML -coremltools = {version = "^3.3", optional = true} - -# Darknet -opencv-python = {version = "^4.2", optional = true} -cffi = {version = "^1.14", optional = true} - -# NOTE: Keras provided by tensorflow package. -# If TF version conflict, maybe try: keras = "2.3.1" - -# MXNet frontend -mxnet = {version = "^1.6.0", optional = true} - -# ONNX frontend -onnx = {version = "1.6.0", optional = true} -onnxruntime = {version = "1.0.0", optional = true} - -# Pytorch (also used by ONNX) -torch = {version = "1.4.0", optional = true} -torchvision = {version = "0.5.0", optional = true} -# NOTE: torch depends on a number of other packages, but unhelpfully, does not expose that in the -# wheel!!! 
-future = {version = "*", optional = true} - -# Tensorflow frontend -tensorflow = {version = "^2.1", optional = true} -tensorflow-estimator = {version = "^2.1", optional = true} - -# TFLite frontend -tflite = {version = "2.1.0", optional = true} -wheel = "*" - - -[tool.poetry.extras] -xgboost = ["xgboost"] -importer-caffe2 = ["torch"] -importer-coreml = ["coremltools"] -importer-darknet = ["opencv-python"] -importer-keras = ["tensorflow", "tensorflow-estimator"] -importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"] -importer-pytorch = ["torch", "torchvision", "future"] -importer-tensorflow = ["tensorflow", "tensorflow-estimator"] -importer-tflite = ["tlfite", "tensorflow", "tensorflow-estimator"] - -[tool.poetry.dev-dependencies] -autodocsumm = "^0.1" -black = "^19.10b0" -sphinx = "^3.0" -sphinx-gallery = "^0.4" -sphinx-rtd-theme = "^0.4" -matplotlib = "^3.2" -Image = "^1.5" -recommonmark = "^0.6" -pillow = "< 7" -pyformat = "^0.7" -pylint = "^2.4" -pytest = "^5.4" - -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" - -[tool.autopep8] -max_line_length = 100 diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 189227bb15a1..82ac1c57018e 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -462,6 +462,7 @@ class RelayBuildModule : public runtime::ModuleNode { const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + // Generate a placeholder function that attaches linked params as its arguments. if (target_host->GetAttr("link-params").value_or(Bool(false))) { CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; auto param_ids = graph_codegen_->GetParamIds(); diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index ea7f19418f20..b36bff019980 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -119,8 +119,12 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& } } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { // Floats and doubles are printed as hex but casted. - one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */ + - 1 /* extra decimal digit in exponent */; + one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; + if (arr_type.bits() == 64) { + one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ + } else if (arr_type.bits() == 32) { + one_element_size_bytes += 1; /* extra decimal digit in exponent, relative to bits / 4 */ + } } int elements_per_row = 16; From bf4207765dc6eab6f81061c2e968326f2020940f Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 20:55:17 -0800 Subject: [PATCH 56/60] git-clang-format --- src/target/source/codegen_params.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index b36bff019980..dde95d1964aa 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -121,7 +121,7 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& // Floats and doubles are printed as hex but casted. 
one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; if (arr_type.bits() == 64) { - one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ + one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ } else if (arr_type.bits() == 32) { one_element_size_bytes += 1; /* extra decimal digit in exponent, relative to bits / 4 */ } From 883c878986d8de43d2dacfb0dfcd701b023c8b26 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 24 Nov 2020 13:48:38 -0800 Subject: [PATCH 57/60] address tqchen, others' comments --- src/target/llvm/codegen_params.cc | 18 ++-- src/target/llvm/codegen_params.h | 13 ++- src/target/source/codegen_params.cc | 162 ++++++++++++++++------------ src/target/source/codegen_params.h | 13 +++ 4 files changed, 127 insertions(+), 79 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index fd44117ade95..254cfe8d1283 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -31,13 +31,6 @@ namespace tvm { namespace codegen { -namespace { -class DLManagedTensorDeleter { - public: - void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } -}; -} // namespace - template struct LLVMConstantGetter { static llvm::Constant* getElement(llvm::Type* ty, T t); @@ -143,6 +136,11 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: case runtime::DataType::TypeCode::kFloat: switch (arr_type.bits()) { + case 16: + // NOTE: float16 is treated as uint16_t. + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + break; case 32: element_type = llvm::Type::getFloatTy(*ctx); BuildLLVMVector(element_type, arr->data, num_elements, &elements); @@ -158,6 +156,12 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: } break; + case runtime::DataType::TypeCode::kBFloat: + CHECK(arr_type.bits() == 16) << "CodegenParams: only support 16-bit bfloat; saw " + << arr_type.bits() << "-bit array"; + element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); + BuildLLVMVector(element_type, arr->data, num_elements, &elements); + default: CHECK(false) << "Data type not supported"; } diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h index c21820aa6c3f..771bc201f7aa 100644 --- a/src/target/llvm/codegen_params.h +++ b/src/target/llvm/codegen_params.h @@ -32,12 +32,17 @@ namespace tvm { namespace codegen { +/*! + * \brief Convert an NDArray to an LLVM array of constants. + * + * The supplied NDArray is flattened, and each element is converted to the appropriate LLVM type. + * + * \param ctx LLVM context used to create the various primitive datatypes. + * \param arr NDArray to convert. + * \return LLVM array containing the array data. 
+ */ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr); -void LLVMCodeGenParams(llvm::LLVMContext* ctx, llvm::Module* module, int64_t storage_id_offset, - ::tvm::runtime::Array param_names, - ::tvm::runtime::Array params_by_sid); - } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index dde95d1964aa..c1cb59d6e870 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -29,23 +29,48 @@ #include #include #include +#include namespace tvm { namespace codegen { -namespace { -class DLManagedTensorDeleter { - public: - void operator()(DLManagedTensor* ptr) { ptr->deleter(ptr); } -}; -} // namespace - +/*! \brief maximum line length of generated parameters, including indent. */ static constexpr const int kMaxLineLength = 80; -template ::value>> -void PrintArray(void* data, size_t num_elements, int elements_per_row, std::string indent_str, - std::ostream& os) { +static int ComputeNumElementsPerRow(int one_element_size_bytes, int indent_chars) { + if (one_element_size_bytes > kMaxLineLength - indent_chars) { + return 1; + } + // When multiple elements fit per line, divide the available space by the size of one element, + // and return the largest power of 2 less than the result. Using power-of-2-sized elements allows + // for easily traversing the generated code. + return 1 << (fls((kMaxLineLength - indent_chars) / one_element_size_bytes) - 1); +} + +template ::value>> +void PrintIntegralArray(void* data, size_t num_elements, int indent_chars, std::ostream& os) { + int one_element_size_bytes = (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */); + if (std::is_signed::value) { + one_element_size_bytes += 1; // sign character + if (sizeof(T) == 64 / 8) { + one_element_size_bytes += 2; // "LL" + } + } else { + if (sizeof(T) == 64 / 8) { + one_element_size_bytes += 3; // "ULL" + } + } + + int elements_per_row = ComputeNumElementsPerRow(one_element_size_bytes, indent_chars); + std::string indent_str(indent_chars, ' '); + for (size_t i = 0; i < num_elements; i++) { + if ((i % elements_per_row) == 0) { + if (i != 0) { + os << std::endl; + } + os << indent_str; + } int64_t elem = static_cast(data)[i]; if (std::is_signed::value) { uint64_t to_print; @@ -57,21 +82,40 @@ void PrintArray(void* data, size_t num_elements, int elements_per_row, std::stri to_print = elem; } os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast(to_print); + if (sizeof(T) == 64 / 8) { + os << "LL"; + } } else { os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast(elem); + if (sizeof(T) == 64 / 8) { + os << "ULL"; + } } if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } + } + + if ((num_elements % elements_per_row) != 0) { + os << "\n"; } } -template ::value>> -void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int elements_per_row, - std::string indent_str, std::ostream& os) { +template ::value>> +void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars, std::ostream& os) { + // Floats and doubles are printed as hex but casted. 
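ComputeNumElementsPerRow rounds the number of elements that fit on a line down
to a power of two, so readers can index into the generated initializer by eye.
A small Python sketch of the same computation (the later patch in this series
replaces fls() with the equivalent clear-lowest-bit loop used here); the
element sizes in the asserts are invented:

MAX_LINE_LENGTH = 80

def elements_per_row(one_element_size_bytes, indent_chars):
    budget = MAX_LINE_LENGTH - indent_chars
    if one_element_size_bytes > budget:
        return 1
    n = budget // one_element_size_bytes
    while n & (n - 1):  # clear the lowest set bit until one bit remains
        n &= n - 1
    return n

assert elements_per_row(one_element_size_bytes=9, indent_chars=4) == 8   # 76 // 9 == 8
assert elements_per_row(one_element_size_bytes=12, indent_chars=4) == 4  # 76 // 12 == 6 -> 4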
+ int one_element_size_bytes = + (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */) + + 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; + if (sizeof(T) == 64 / 8) { + one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ + } else if (sizeof(T) == 32 / 8) { + one_element_size_bytes += 1; /* extra decimal digit in exponent, relative to bits / 4 */ + } + + int elements_per_row = ComputeNumElementsPerRow(one_element_size_bytes, indent_chars); + std::string indent_str(indent_chars, ' '); + std::stringstream ss; if (std::is_signed::value) { ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific, @@ -81,6 +125,13 @@ void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int std::ios::basefield | std::ios::showbase | std::ios::floatfield); } for (size_t i = 0; i < num_elements; i++) { + if ((i % elements_per_row) == 0) { + if (i != 0) { + os << std::endl; + } + os << indent_str; + } + T elem = static_cast(data)[i]; if (std::isinf(elem)) { // C99 standard. @@ -96,9 +147,10 @@ void PrintArray(void* data, size_t num_elements, int one_element_size_bytes, int if (i < num_elements - 1) { os << ", "; } - if (((i + 1) % elements_per_row) == 0) { - os << "\n" << indent_str; - } + } + + if ((num_elements % elements_per_row) != 0) { + os << "\n"; } } @@ -107,42 +159,12 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); - int one_element_size_bytes = (arr_type.bits() / 4) + (2 /* "0x" */) + (2 /* ", " */); - if (arr_type.code() == runtime::DataType::TypeCode::kInt) { - one_element_size_bytes += 1; // sign character - if (arr_type.bits() > 32) { - one_element_size_bytes += 2; // "LL" - } - } else if (arr_type.code() == runtime::DataType::TypeCode::kUInt) { - if (arr_type.bits() > 32) { - one_element_size_bytes += 3; // "ULL" - } - } else if (arr_type.code() == runtime::DataType::TypeCode::kFloat) { - // Floats and doubles are printed as hex but casted. 
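float16 (and bfloat16, below) have no portable C literal syntax, so their
parameters are emitted as raw uint16_t bit patterns. A small numpy
illustration of the values that end up in the generated array:

import numpy as np

half = np.array([1.0, -2.0, 0.5], dtype="float16")
bits = half.view("uint16")
print([hex(int(b)) for b in bits])  # ['0x3c00', '0xc000', '0x3800']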
- one_element_size_bytes += 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; - if (arr_type.bits() == 64) { - one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ - } else if (arr_type.bits() == 32) { - one_element_size_bytes += 1; /* extra decimal digit in exponent, relative to bits / 4 */ - } - } - - int elements_per_row = 16; - while (elements_per_row > 1 && - (elements_per_row * one_element_size_bytes) > (kMaxLineLength - indent_chars)) { - elements_per_row /= 2; - } - - std::string indent_str(indent_chars, ' '); - os << indent_str; - auto shape = arr.Shape(); int num_elements = 1; for (auto shape_elem : shape) { num_elements *= shape_elem; } - std::unique_ptr tensor(arr.ToDLPack()); auto old_fmtflags = os.flags(); os.setf(std::ios::internal | std::ios::hex, std::ios::adjustfield | std::ios::basefield | std::ios::showbase); @@ -154,13 +176,13 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw " << arr_type.bits() << "-bit array"; if (arr_type.bits() == 8) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 16) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 32) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 64) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else { CHECK(false) << "should not get here"; } @@ -173,16 +195,13 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& << arr_type.bits() << "-bit array"; if (arr_type.bits() == 8) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 16) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, - os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 32) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, - os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 64) { - PrintArray(tensor->dl_tensor.data, num_elements, elements_per_row, indent_str, - os); + PrintIntegralArray(arr->data, num_elements, indent_chars, os); } else { CHECK(false) << "should not get here"; } @@ -191,12 +210,13 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& case runtime::DataType::TypeCode::kFloat: { os.fill(' '); os.setf(std::ios::left, std::ios::adjustfield); - if (arr_type.bits() == 32) { - PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, - elements_per_row, indent_str, os); + if (arr_type.bits() == 16) { + // NOTE: print types not widely supported by C as uint16_t. 
+ PrintIntegralArray(arr->data, num_elements, indent_chars, os); + } else if (arr_type.bits() == 32) { + PrintFloatingPointArray(arr->data, num_elements, indent_chars, os); } else if (arr_type.bits() == 64) { - PrintArray(tensor->dl_tensor.data, num_elements, one_element_size_bytes, - elements_per_row, indent_str, os); + PrintFloatingPointArray(arr->data, num_elements, indent_chars, os); } else { CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw " << arr_type.bits() << "-bit array"; @@ -204,13 +224,19 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& break; } + case runtime::DataType::TypeCode::kBFloat: { + // NOTE: print types not widely supported by C as uint16_t. + CHECK(arr_type.bits() == 16) + << "CodegenParams: only support generating 16-bit bfloat params; saw " + << arr_type.bits() << "-bit array"; + PrintIntegralArray(arr->data, num_elements, indent_chars, os); + break; + } + default: CHECK(false) << "Data type not supported"; } - if (num_elements % elements_per_row != 0) { - os << "\n"; - } os.flags(old_fmtflags); } diff --git a/src/target/source/codegen_params.h b/src/target/source/codegen_params.h index a3d277eac590..cc126c767c58 100644 --- a/src/target/source/codegen_params.h +++ b/src/target/source/codegen_params.h @@ -31,6 +31,19 @@ namespace tvm { namespace codegen { +/*! + * \brief Write a C representation of arr to os. + * + * This function generates a comma-separated, indented list of C integer listeals suitable for use + * in an initializer. The NDArray is flattened and then the list is produced element by element. + * For the int16_t NDArray [-3, -2, -1, 0, 1, 2, 3, ...], and indent_chars = 4, the following output + * is produced: + * -0x0003, -0x0002, -0x0001, +0x0000, +0x0001, +0x0002, +0x0003 + * + * \param arr The array to generate + * \param indent_chars Number of chars to indent + * \param os Output stream where the array data should be written. 
+ */ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); } // namespace codegen From 4400a346c303746e9c9bf651a20a36c3c757576e Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 24 Nov 2020 13:49:05 -0800 Subject: [PATCH 58/60] git-clang-format --- src/target/llvm/codegen_params.cc | 4 ++-- src/target/source/codegen_params.cc | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 254cfe8d1283..694be5621606 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -157,8 +157,8 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: break; case runtime::DataType::TypeCode::kBFloat: - CHECK(arr_type.bits() == 16) << "CodegenParams: only support 16-bit bfloat; saw " - << arr_type.bits() << "-bit array"; + CHECK(arr_type.bits() == 16) + << "CodegenParams: only support 16-bit bfloat; saw " << arr_type.bits() << "-bit array"; element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits()); BuildLLVMVector(element_type, arr->data, num_elements, &elements); diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index c1cb59d6e870..bd29722e312e 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -24,12 +24,12 @@ #include "codegen_params.h" #include +#include #include #include #include #include -#include namespace tvm { namespace codegen { @@ -104,9 +104,8 @@ void PrintIntegralArray(void* data, size_t num_elements, int indent_chars, std:: template ::value>> void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars, std::ostream& os) { // Floats and doubles are printed as hex but casted. - int one_element_size_bytes = - (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */) - + 1 /* sign */ + 1 /* decimal point */ + 1 /* exponent sign */; + int one_element_size_bytes = (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */) + 1 /* sign */ + + 1 /* decimal point */ + 1 /* exponent sign */; if (sizeof(T) == 64 / 8) { one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */ } else if (sizeof(T) == 32 / 8) { @@ -227,8 +226,8 @@ void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& case runtime::DataType::TypeCode::kBFloat: { // NOTE: print types not widely supported by C as uint16_t. 
CHECK(arr_type.bits() == 16) - << "CodegenParams: only support generating 16-bit bfloat params; saw " - << arr_type.bits() << "-bit array"; + << "CodegenParams: only support generating 16-bit bfloat params; saw " << arr_type.bits() + << "-bit array"; PrintIntegralArray(arr->data, num_elements, indent_chars, os); break; } From f53c2e2813b95ff1b418648191b2cf10fff17595 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 24 Nov 2020 14:24:46 -0800 Subject: [PATCH 59/60] remove fls, which isn't widely available --- src/target/source/codegen_params.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc index bd29722e312e..cc7695abfd25 100644 --- a/src/target/source/codegen_params.cc +++ b/src/target/source/codegen_params.cc @@ -24,7 +24,6 @@ #include "codegen_params.h" #include -#include #include #include @@ -44,7 +43,13 @@ static int ComputeNumElementsPerRow(int one_element_size_bytes, int indent_chars // When multiple elements fit per line, divide the available space by the size of one element, // and return the largest power of 2 less than the result. Using power-of-2-sized elements allows // for easily traversing the generated code. - return 1 << (fls((kMaxLineLength - indent_chars) / one_element_size_bytes) - 1); + int elements_per_row = (kMaxLineLength - indent_chars) / one_element_size_bytes; + + // Implementation of fls. Iteratively clear the LSB until one bit remains. + while ((elements_per_row & (elements_per_row - 1)) > 0) { + elements_per_row &= elements_per_row - 1; + } + return elements_per_row; } template ::value>> From 754cf350c660a767e3663ca2911129dfaba27759 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 25 Nov 2020 08:29:09 -0800 Subject: [PATCH 60/60] address tqchen comments --- src/runtime/graph/graph_runtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 0033a1d5d8d2..26b66be72bd4 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -253,7 +253,7 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { void GraphRuntime::LinkedNDArrayDeleter(Object* container) { // container is the NDArray::Container which needs to get deleted. // The data member points to global const memory, so it does not need deleting. - delete reinterpret_cast(container); + delete static_cast(container); } void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) {
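Taken together, the series leaves a small contract between the code generators
and the graph runtime, which patch 60's cleanup in DefaultLookupLinkedParam
also touches: the lookup function maps a storage id either to an opaque handle
for the linked constant, or to null, in which case SetupStorage falls back to
allocating an empty buffer. A compact Python sketch of that contract; the
storage ids and symbols are hypothetical, and the numeric type codes stand in
for kTVMOpaqueHandle and kTVMNullptr from the C runtime API:

K_TVM_OPAQUE_HANDLE = 3
K_TVM_NULLPTR = 4

LINKED = {2: "__tvm_param__p0", 5: "__tvm_param__p1"}  # storage_id -> symbol

def lookup_linked_param(storage_id):
    if storage_id in LINKED:
        return LINKED[storage_id], K_TVM_OPAQUE_HANDLE
    return None, K_TVM_NULLPTR

for sid in (2, 3):
    handle, tcode = lookup_linked_param(sid)
    action = "use linked constant" if tcode == K_TVM_OPAQUE_HANDLE else "allocate empty NDArray"
    print(sid, handle, action)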