diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 6018d40dd300..e411c929f936 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -32,8 +32,10 @@ #define CRT_MEMORY_NUM_PAGES 16384 #define CRT_MEMORY_PAGE_SIZE_LOG2 10 +#define CRT_MEMORY_PAGE_SIZE_BYTES (1 << CRT_MEMORY_PAGE_SIZE_LOG2) -static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * (1 << CRT_MEMORY_PAGE_SIZE_LOG2)]; +static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * CRT_MEMORY_PAGE_SIZE_BYTES] + __attribute__((aligned(CRT_MEMORY_PAGE_SIZE_BYTES))); static MemoryManagerInterface* g_memory_manager; /*! \brief macro to do C API call */ diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index 18a7b2bbb0ff..846e21b36883 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -33,8 +33,10 @@ #define CRT_MEMORY_NUM_PAGES 16384 #define CRT_MEMORY_PAGE_SIZE_LOG2 10 +#define CRT_MEMORY_PAGE_SIZE_BYTES (1 << CRT_MEMORY_PAGE_SIZE_LOG2) -static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * (1 << CRT_MEMORY_PAGE_SIZE_LOG2)]; +static uint8_t g_crt_memory[CRT_MEMORY_NUM_PAGES * CRT_MEMORY_PAGE_SIZE_BYTES] + __attribute__((aligned(CRT_MEMORY_PAGE_SIZE_BYTES))); static MemoryManagerInterface* g_memory_manager; /*! \brief macro to do C API call */ diff --git a/apps/bundle_deploy/test.cc b/apps/bundle_deploy/test.cc index 25056f4d17a6..7194427426d2 100644 --- a/apps/bundle_deploy/test.cc +++ b/apps/bundle_deploy/test.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ char* read_all_or_die(const char* name, const char* file_path, size_t* out_size) *out_size = st.st_size; } - char* data = (char*)malloc(st.st_size); + char* data = (char*)std::aligned_alloc(64, st.st_size); FILE* fp = fopen(file_path, "rb"); size_t bytes_to_read = st.st_size; size_t bytes_read = 0; @@ -129,7 +130,7 @@ int main(int argc, char** argv) { ftvm_runtime_run(handle); gettimeofday(&t3, 0); - float output_storage[10 * 5]; + float* output_storage = static_cast(std::aligned_alloc(64, 10 * 5 * sizeof(float))); std::vector output_shape = {10, 5}; DLTensor output; output.data = output_storage; @@ -162,6 +163,9 @@ int main(int argc, char** argv) { (t4.tv_sec - t3.tv_sec) * 1000.0f + (t4.tv_usec - t3.tv_usec) / 1000.f, (t5.tv_sec - t4.tv_sec) * 1000.0f + (t5.tv_usec - t4.tv_usec) / 1000.f); + free(output_storage); + free(result_storage); + free(input_storage); free(json_data); free(params_data); dlclose(bundle); diff --git a/apps/bundle_deploy/test_static.c b/apps/bundle_deploy/test_static.c index b9c980843ea1..4258b3a4e0f9 100644 --- a/apps/bundle_deploy/test_static.c +++ b/apps/bundle_deploy/test_static.c @@ -54,12 +54,12 @@ int main(int argc, char** argv) { void* handle = tvm_runtime_create(json_data, params_data, params_size, argv[0]); gettimeofday(&t1, 0); - float input_storage[10 * 5]; + float* input_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); fp = fopen(argv[1], "rb"); fread(input_storage, 10 * 5, 4, fp); fclose(fp); - float result_storage[10 * 5]; + float* result_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); fp = fopen(argv[2], "rb"); fread(result_storage, 10 * 5, 4, fp); fclose(fp); @@ -82,7 +82,7 @@ int main(int argc, char** argv) { tvm_runtime_run(handle); gettimeofday(&t3, 0); - float output_storage[10 * 5]; + float* output_storage = aligned_alloc(64, 10 * 5 * sizeof(float)); DLTensor output; output.data = output_storage; DLDevice out_dev = {kDLCPU, 0}; @@ -117,6 +117,9 @@ int main(int argc, char** 
argv) { (t4.tv_sec - t3.tv_sec) * 1000 + (t4.tv_usec - t3.tv_usec) / 1000.f, (t5.tv_sec - t4.tv_sec) * 1000 + (t5.tv_usec - t4.tv_usec) / 1000.f); + free(output_storage); + free(result_storage); + free(input_storage); free(json_data); free(params_data); diff --git a/cmake/utils/CRTConfig.cmake b/cmake/utils/CRTConfig.cmake index 42c523b08786..7f6b058bfd0a 100644 --- a/cmake/utils/CRTConfig.cmake +++ b/cmake/utils/CRTConfig.cmake @@ -25,6 +25,7 @@ function(generate_crt_config platform output_path) set(TVM_CRT_MAX_STRLEN_DLTYPE 10) set(TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120) set(TVM_CRT_MAX_STRLEN_PARAM_NAME 80) + set(TVM_CRT_ALLOC_ALIGNMENT 64) if("${platform}" STREQUAL "zephyr") set(TVM_CRT_MAX_PACKET_SIZE_BYTES 512) diff --git a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py index d6e1922efa85..52b1dec2c9fe 100755 --- a/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_tvm_basic/src/build_test_lib.py @@ -23,19 +23,25 @@ import tvm from tvm.relay.backend import Runtime -from tvm import te +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + runtime = Runtime("cpp", {"system-lib": True}) - print(tvm.lower(s, [A, B, C], simple_mode=True)) - tvm.build(s, [A, B, C], "llvm", runtime=runtime).save(osp.join(sys.argv[1], "test.o")) + print(tvm.lower(func, simple_mode=True)) + tvm.build(func, target="llvm", runtime=runtime).save(osp.join(sys.argv[1], "test.o")) if __name__ == "__main__": diff --git a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py b/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py index 4b270fa17cbc..415832a614f0 100755 --- a/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_tvm_dso/src/build_test_lib.py @@ -22,20 +22,26 @@ import sys import tvm -from tvm import te from tvm.contrib import cc +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) - print(tvm.lower(s, [A, B, C], simple_mode=True)) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + + print(tvm.lower(func, simple_mode=True)) obj_file = osp.join(sys.argv[1], "test.o") - tvm.build(s, [A, B, C], "llvm").save(obj_file) + tvm.build(func, "llvm").save(obj_file) cc.create_shared(osp.join(sys.argv[1], "test.so"), [obj_file]) diff --git a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py 
b/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py index 2bf327a31b1b..6d744b28c67e 100755 --- a/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py +++ b/rust/tvm-graph-rt/tests/test_wasm32/src/build_test_lib.py @@ -22,20 +22,26 @@ import sys import tvm -from tvm import te from tvm.relay.backend import Runtime +from tvm.script import tir as T def main(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.te.create_schedule(C.op) - s[C].parallel(s[C].op.axis[0]) - print(tvm.lower(s, [A, B, C], simple_mode=True)) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + for i in T.parallel(n): + with T.block("C"): + vi = T.axis.spatial(n, i) + C[vi] = A[vi] + B[vi] + + print(tvm.lower(func, simple_mode=True)) runtime = Runtime("cpp", {"system-lib": True}) - tvm.build(s, [A, B, C], "llvm -mtriple=wasm32-unknown-unknown", runtime=runtime).save( + tvm.build(func, target="llvm -mtriple=wasm32-unknown-unknown", runtime=runtime).save( osp.join(sys.argv[1], "test.o") ) diff --git a/rust/tvm/tests/basics/src/tvm_add.py b/rust/tvm/tests/basics/src/tvm_add.py index fc5c4213bd08..807e27db7794 100755 --- a/rust/tvm/tests/basics/src/tvm_add.py +++ b/rust/tvm/tests/basics/src/tvm_add.py @@ -20,23 +20,34 @@ import sys import tvm -from tvm import te from tvm.contrib import cc +from tvm.script import tir as T def main(target, out_dir): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) + @T.prim_func + def func(var_A: T.handle, var_B: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)}) + n = T.int32() + A = T.match_buffer(var_A, (n,), align=1) + B = T.match_buffer(var_B, (n,), align=1) + C = T.match_buffer(var_C, (n,), align=1) + # with T.block("root"): + for i in range(n): + with T.block("C"): + v_i = T.axis.spatial(n, i) + T.reads(A[v_i], B[v_i]) + T.writes(C[v_i]) + C[v_i] = A[v_i] + B[v_i] if target == "cuda": - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) + sch = tvm.tir.Schedule(func) + i, j = sch.split(sch.get_loops("C")[0], [None, 64]) + sch.bind(i, "blockIdx.x") + sch.bind(j, "threadIdx.x") + func = sch.mod["main"] - fadd = tvm.build(s, [A, B, C], tvm.target.Target(target, host="llvm"), name="myadd") + fadd = tvm.build(func, target=tvm.target.Target(target, host="llvm"), name="myadd") fadd.save(osp.join(out_dir, "test_add.o")) if target == "cuda": fadd.imported_modules[0].save(osp.join(out_dir, "test_add.ptx")) diff --git a/src/runtime/crt/aot_executor_module/aot_executor_module.c b/src/runtime/crt/aot_executor_module/aot_executor_module.c index a5c8105144f7..155f45bff5ce 100644 --- a/src/runtime/crt/aot_executor_module/aot_executor_module.c +++ b/src/runtime/crt/aot_executor_module/aot_executor_module.c @@ -154,7 +154,7 @@ int32_t TVMAotExecutorModule_GetInputName(TVMValue* args, int* tcodes, int nargs return kTvmErrorFunctionCallNumArguments; } - char* name; + const char* name; int ret = TVMAotExecutor_GetInputName(aot_executor.executor, args[0].v_int64, &name); if (ret < 0) {
return kTvmErrorExecutorModuleNoSuchInput; diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 99b3201b95b0..4e0e0d4fc240 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -90,10 +91,26 @@ int TVMArrayFree(TVMArrayHandle handle) { int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { - if (alignment != 1) { - nbytes = (nbytes + alignment - 1) / alignment * alignment; - } - return TVMPlatformMemoryAllocate(nbytes, dev, out_data); + // The TVMPlatformMemoryAllocate function does not guarantee the + // alignment of the allocation. Therefore, deliberately + // overallocate by (alignment-1) and return an aligned region from + // it. + size_t total_bytes = nbytes + sizeof(void*) + (alignment - 1); + void* allocated_buf; + int err = TVMPlatformMemoryAllocate(total_bytes, dev, &allocated_buf); + if (err) return err; + + void* first_allowed_data_ptr = ((uint8_t*)allocated_buf) + sizeof(void*); + uintptr_t offset = (alignment - ((uintptr_t)first_allowed_data_ptr) % alignment); + void* data_ptr = first_allowed_data_ptr + offset; + + // Must keep a pointer to the original allocation, so that it can be + // passed to TVMPlatformMemoryFree. + ((void**)data_ptr)[-1] = allocated_buf; + + *out_data = data_ptr; + + return err; } int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -110,7 +127,10 @@ int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shap return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data); } -int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { return TVMPlatformMemoryFree(ptr, dev); } +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { + void* allocated_buf = ((void**)ptr)[-1]; + return TVMPlatformMemoryFree(allocated_buf, dev); +} TVM_ATTRIBUTE_UNUSED static bool IsContiguous(const DLTensor* arr) { if (arr->strides == NULL) return true; diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index b0e869766bde..1c1e3c58f47a 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -63,8 +63,8 @@ int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, D return status; } int total_elem_bytes = TVMNDArray_DataSizeBytes(array); - array->dl_tensor.data = - TVMBackendAllocWorkspace(kDLCPU, 0, total_elem_bytes, dtype.code, dtype.bits); + TVMDeviceAllocDataSpace(dev, total_elem_bytes, TVM_CRT_ALLOC_ALIGNMENT, dtype, + &array->dl_tensor.data); memset(array->dl_tensor.data, 0, total_elem_bytes); return 0; } @@ -167,7 +167,7 @@ int TVMNDArray_Release(TVMNDArray* arr) { return 0; } - err = TVMPlatformMemoryFree(arr->dl_tensor.data, dev); + err = TVMDeviceFreeDataSpace(dev, arr->dl_tensor.data); if (err != kTvmErrorNoError) { return err; } diff --git a/src/runtime/crt/crt_config.h.template b/src/runtime/crt/crt_config.h.template index 1d32253282e8..70ac85b91fec 100644 --- a/src/runtime/crt/crt_config.h.template +++ b/src/runtime/crt/crt_config.h.template @@ -54,6 +54,9 @@ /*! Maximum supported string length in parameter names */ #define TVM_CRT_MAX_STRLEN_PARAM_NAME ${TVM_CRT_MAX_STRLEN_PARAM_NAME} +/*! Alignment (in bytes) for data buffer allocation */ +#define TVM_CRT_ALLOC_ALIGNMENT ${TVM_CRT_ALLOC_ALIGNMENT} + /*! Enable checks to enforce the stack allocator with a FIFO ordering. 
Off by default */ // #define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK diff --git a/src/runtime/crt/graph_executor/graph_executor.c b/src/runtime/crt/graph_executor/graph_executor.c index 395a343ccb41..46704a332a31 100644 --- a/src/runtime/crt/graph_executor/graph_executor.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -841,15 +841,10 @@ int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_bl status = -1; } - if (executor->data_entry[eid].dl_tensor.shape) { - err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.shape, dev); - if (err != kTvmErrorNoError) { - status = -1; - } - executor->data_entry[eid].dl_tensor.shape = 0; - } + // The memory in the executor->data_entry[eid].dl_tensor.shape is + // owned by attrs->shape, and should not be freed here. if (executor->data_entry[eid].dl_tensor.data) { - err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.data, dev); + err = TVMDeviceFreeDataSpace(dev, executor->data_entry[eid].dl_tensor.data); if (err != kTvmErrorNoError) { status = -1; } diff --git a/src/runtime/crt/host/CMakeLists.txt.template b/src/runtime/crt/host/CMakeLists.txt.template index be0bce85513b..8059ea316524 100644 --- a/src/runtime/crt/host/CMakeLists.txt.template +++ b/src/runtime/crt/host/CMakeLists.txt.template @@ -34,6 +34,9 @@ set(CRT_LIBS microtvm_rpc_server memory ) + +add_library(tvm_model) + # Build CRT libraries foreach(crt_lib_name ${CRT_LIBS}) add_library(${crt_lib_name}) @@ -41,11 +44,16 @@ foreach(crt_lib_name ${CRT_LIBS}) target_sources(${crt_lib_name} PRIVATE ${crt_lib_srcs}) target_include_directories(${crt_lib_name} PRIVATE crt_config crt/include) target_compile_definitions(${crt_lib_name} PRIVATE -DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE) - target_link_libraries(main PRIVATE ${crt_lib_name}) + # Circular dependencies result in the static libraries being listed + # twice in the link command, resolving circular dependencies between + # the libraries and the model. 
+ # + # See https://cmake.org/cmake/help/latest/command/target_link_libraries.html#cyclic-dependencies-of-static-libraries + target_link_libraries(tvm_model PRIVATE ${crt_lib_name}) + target_link_libraries(${crt_lib_name} PRIVATE tvm_model) endforeach(crt_lib_name ${CRT_LIBS}) # Build model files -add_library(tvm_model) file(GLOB_RECURSE tvm_model_srcs model/codegen/host/src/*.c model/codegen/host/lib/*.o) target_sources(tvm_model PRIVATE ${tvm_model_srcs}) target_include_directories(tvm_model PRIVATE ${CMAKE_SOURCE_DIR}/include crt_config crt/include) diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index c2efa79c0c83..e055187bb467 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -182,30 +182,7 @@ struct NDArray::Internal { NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, uint64_t relative_byte_offset) { ICHECK(data_ != nullptr); - const DLTensor& orig = get_mutable()->dl_tensor; - CHECK(IsContiguous()) << [&orig]() { - std::stringstream ss; - ss << "Can only create view for compact tensor, but found strides "; - - ss << "["; - for (int i = 0; i < orig.ndim; i++) { - if (i) ss << ", "; - ss << orig.strides[i]; - } - ss << "]"; - - ss << ", for shape "; - ss << "["; - for (int i = 0; i < orig.ndim; i++) { - if (i) ss << ", "; - ss << orig.shape[i]; - } - ss << "]"; - return ss.str(); - }(); - const auto& curr_dl_tensor = get_mutable()->dl_tensor; - NDArray ret = Internal::Create(shape, dtype, curr_dl_tensor.device); size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor); @@ -273,9 +250,6 @@ NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { data->SetDeleter(Internal::DLPackDeleter); // fill up content. data->manager_ctx = tensor; - ICHECK(::tvm::runtime::IsContiguous(tensor->dl_tensor)) << "DLManagedTensor must be contiguous."; - ICHECK(IsAligned(tensor->dl_tensor)) - << "Data in DLManagedTensor is not aligned as required by NDArray"; data->dl_tensor = tensor->dl_tensor; // update shape_ std::vector shape; diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 6fc083d17ccf..e378de6a9228 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1439,7 +1439,12 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { return ret_dummy; } else if (op->op.same_as(builtin::reinterpret())) { llvm::Type* target = DTypeToLLVMType(op->dtype); - return builder_->CreateBitCast(MakeValue(op->args[0]), target); + llvm::Value* value = MakeValue(op->args[0]); + if (op->args[0].dtype().is_handle() && (op->dtype.is_int() || op->dtype.is_uint())) { + return builder_->CreatePtrToInt(value, target); + } else { + return builder_->CreateBitCast(value, target); + } } else if (op->op.same_as(builtin::isnan())) { // TODO(hgt312): set fast math flag llvm::Value* a = MakeValue(op->args[0]); diff --git a/src/tir/transforms/arg_binder.cc b/src/tir/transforms/arg_binder.cc index 5b9e005b7ea3..a4e39984ffd5 100644 --- a/src/tir/transforms/arg_binder.cc +++ b/src/tir/transforms/arg_binder.cc @@ -302,25 +302,47 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, const PrimExpr& device_type, if (Bind_(buffer->data, TVMArrayGet(DataType::Handle(), handle, builtin::kArrData), arg_name + ".data", true)) { Var vptr(buffer->data); + def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); + + // Mark alignment of external bufs. 
+ AssertStmt satisfies_tvm_alignment = [&]() { + PrimExpr interpretable_as_pointer = + (device_type == kDLCPU) || (device_type == kDLCUDA) || (device_type == kDLCUDAHost) || + (device_type == kDLCUDAManaged) || (device_type == kDLHexagon); + PrimExpr is_aligned = + tvm::floormod(tvm::reinterpret(DataType::UInt(vptr->dtype.bits()), vptr), + buffer->data_alignment) == 0; + + std::ostringstream alignment_err_msg; + alignment_err_msg << arg_name << ".data is expected to be aligned to " + << buffer->data_alignment << " bytes"; + + return AssertStmt(!interpretable_as_pointer || is_aligned, StringImm(alignment_err_msg.str()), + nop); + }(); + asserts_.push_back(satisfies_tvm_alignment); // Check if the data pointer is NULL. This check is skipped for // size-0 arrays, since CUDA provides a NULL pointer for size-zero // allocations. - auto alloc_size = [&]() -> PrimExpr { - PrimExpr product = IntImm(buffer->DefaultIndexType(), 1); - for (const auto& dim : buffer->shape) { - product *= dim; - } - return product; + AssertStmt valid_data_ptr_for_non_empty_array = [&]() { + auto alloc_size = [&]() -> PrimExpr { + PrimExpr product = IntImm(buffer->DefaultIndexType(), 1); + for (const auto& dim : buffer->shape) { + product *= dim; + } + return product; + }(); + return AssertStmt( + alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}), + tvm::tir::StringImm(arg_name + " is expected to have non-NULL data pointer"), nop); }(); - asserts_.emplace_back(AssertStmt( - alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}), - tvm::tir::StringImm(arg_name + " is expected to have non-NULL data pointer"), nop)); + asserts_.push_back(valid_data_ptr_for_non_empty_array); - def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); - // mark alignment of external bufs - init_nest_.emplace_back(AttrStmt(vptr, tir::attr::storage_alignment, - IntImm(DataType::Int(32), buffer->data_alignment), nop)); + // AttrStmt provides a @llvm.assume that can be placed in the + // generated LLVM kernel + init_nest_.push_back(AttrStmt(vptr, tir::attr::storage_alignment, + IntImm(DataType::Int(32), buffer->data_alignment), nop)); } } diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index 5abfe24f434d..f447ddaa200f 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -244,13 +244,25 @@ void BufferInfoExtractor::RecordAllocateNodeInfo(const AllocateNode* op) { "user-given arguments for memory pools, the default behaviour is a single size " "un-restricted pool is assigned"; PrimFunc func = scope_stack_.top().func; - Optional executor_config = - module_->GetAttr(tvm::attr::kExecutor); - Integer workspace_alignment = 16; - if (executor_config) { - workspace_alignment = - executor_config.value()->GetAttr("workspace-byte-alignment").value_or(16); - } + auto workspace_alignment = [&]() -> Integer { + if (const auto* decl_buffer = op->body.as()) { + ICHECK(decl_buffer->buffer->data.same_as(op->buffer_var)) + << "DeclBuffer of Buffer " << decl_buffer->buffer << " has data ptr " + << decl_buffer->buffer->data + << ", which is mismatched from the parent Allocate's buffer_var of " + << op->buffer_var; + return decl_buffer->buffer->data_alignment; + } + + if (auto executor_config = module_->GetAttr(tvm::attr::kExecutor)) { + if (auto config_alignment = + executor_config.value()->GetAttr("workspace-byte-alignment")) { + return config_alignment.value(); + } + } + + return 
tvm::runtime::kAllocAlignment; + }(); BufferInfoKind bi_kind = BufferInfoKind::kIntermediate; String buffer_info_name = op->buffer_var->name_hint; diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py index f1316ae3cee0..f81bb8a3b744 100644 --- a/tests/python/codegen/test_target_codegen_llvm.py +++ b/tests/python/codegen/test_target_codegen_llvm.py @@ -382,7 +382,7 @@ def test_alignment(): s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=8) s[B].vectorize(tx) - f = tvm.build(s, [A, B], "llvm", name="test_alignment") + f = tvm.build(s, [A, B], "llvm -opt-level=0", name="test_alignment") lines = f.get_source().split("\n") diff --git a/tests/python/tir-transform/test_tir_argument_alignment.py b/tests/python/tir-transform/test_tir_argument_alignment.py new file mode 100644 index 000000000000..081dae64254f --- /dev/null +++ b/tests/python/tir-transform/test_tir_argument_alignment.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-function-docstring,missing-module-docstring + +import pytest + +import tvm +import tvm.testing +from tvm.script import tir as T + + +alignment_required = tvm.testing.parameter(1, 32, 64) +offset = tvm.testing.parameter(0, 3, 32, 64) + + +def test_aligned_dltensor(alignment_required, offset): + """Alignment of buffer arguments checked during DLTensor unpacking + + TVM allocates buffers that are aligned according to the value of + `tvm::runtime::kAllocAlignment`. However, buffers may be + non-aligned, either when provided by an external source, or when + the TVM runtime used for compilation and for execution have a + different value of `tvm::runtime::kAllocAlignment` through the + `TVM_KALLOC_ALIGNMENT` macro definition. In addition, while + `tvm::runtime::kAllocAlignment` is the default alignment for TIR + buffers, it can be overridden on a per-buffer basis. + + This test varies the alignment required by a buffer argument and + the alignment provided by an externally-owned array, validating + that non-aligned buffers may always be converted to TVM, and must + have their alignment validated when calling a PrimFunc. 
+ """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, 16, dtype="int8", align=alignment_required) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.arange(128, dtype=torch.int8) + torch_view = torch_tensor[offset : offset + 16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + satisfies_alignment = offset % alignment_required == 0 + if satisfies_alignment: + built(tvm_array) + else: + with pytest.raises(tvm.TVMError): + built(tvm_array) + + +contiguity_test_case = tvm.testing.parameter( + by_dict={ + "entire_first_row": ([4, 16], [0, 0]), + "entire_second_row": ([4, 16], [1, 0]), + "left_half_of_first_row": ([4, 32], [0, 0]), + "right_half_of_first_row": ([4, 32], [0, 16]), + } +) + + +def test_contiguous_dltensor(contiguity_test_case): + """Validate argument buffer is compact when strides are unspecified.""" + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, [1, 16], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + view_backing_shape, view_offset = contiguity_test_case + torch_tensor = torch.zeros(*view_backing_shape, dtype=torch.int8) + torch_view = torch_tensor[ + view_offset[0] : view_offset[0] + 1, + view_offset[1] : view_offset[1] + 16, + ] + tvm_array = tvm.nd.from_dlpack(torch_view) + + built(tvm_array) + + +strided_test_case = tvm.testing.parameter( + by_dict={ + "entire_buffer": (8, 16), + "split_in_slowest_changing_dim": (32, 16), + "split_in_fastest_changing_dim": (8, 64), + } +) + + +def test_dynamic_striding_on_external_dltensor(strided_test_case): + """External buffers may be strided. + + Validity is checked by the TIR unpacking of the DLTensor, based on + the requirements of the TIR buffer. + """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + stride_i = T.var("int32") + stride_j = T.var("int32") + A = T.match_buffer(a, [8, 16], strides=[stride_i, stride_j], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.zeros(*strided_test_case, dtype=torch.int8) + torch_view = torch_tensor[:8, :16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + built(tvm_array) + + +def test_static_striding_on_external_dltensor(strided_test_case): + """External buffers may be strided. + + Import of strided arrays from external sources is legal. The + validity for any given PrimFunc is checked by the TIR unpacking of + the DLTensor, based on the requirements of the TIR buffer. 
+ """ + torch = pytest.importorskip("torch") + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, [8, 16], dtype="int8", align=1) + T.evaluate(0) + + built = tvm.build(func) + + torch_tensor = torch.zeros(*strided_test_case, dtype=torch.int8) + torch_view = torch_tensor[:8, :16] + tvm_array = tvm.nd.from_dlpack(torch_view) + + has_correct_striding = strided_test_case[1] == 16 + if has_correct_striding: + built(tvm_array) + else: + with pytest.raises(tvm.TVMError): + built(tvm_array) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/tir-usmp/test_tir_usmp_algo.py b/tests/python/tir-usmp/test_tir_usmp_algo.py index b9cfde485633..886b10c3b2fc 100644 --- a/tests/python/tir-usmp/test_tir_usmp_algo.py +++ b/tests/python/tir-usmp/test_tir_usmp_algo.py @@ -365,8 +365,8 @@ def run_model(input: T.handle, output: T.handle) -> None: @pytest.mark.parametrize( ["algorithm", "fast_memory_size", "slow_memory_size"], [ - ("greedy_by_size", 200704, 1418528), - ("greedy_by_conflicts", 200704, 1418528), + ("greedy_by_size", 200704, 1418560), + ("greedy_by_conflicts", 200704, 1418560), ("hill_climb", 200704, 1117462), ], ) diff --git a/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index 9e9fea7c8152..5de8c83f288e 100644 --- a/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/tir-usmp/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -106,7 +106,7 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150560], dtype="uint8", elem_offset=0, align=64, offset_factor=1) placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body @@ -118,7 +118,7 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150560], dtype="int16", elem_offset=0, align=64, offset_factor=1) placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) @@ -178,11 +178,11 @@ class LinearStructurePlanned: @T.prim_func def __tvm_main__(input: T.handle, fast_memory_0_var: T.handle("uint8"), slow_memory_1_var: T.handle("uint8"), output: T.handle) -> None: fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], 
dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body T.attr("default", "device_id", 0) T.attr("default", "device_type", 1) - sid_9_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[1117472], dtype="handle") + sid_9_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[1117504], dtype="handle") sid_8_let: T.handle("int8") = T.address_of(slow_memory_1_buffer_var[0], dtype="handle") T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9_let, fast_memory_0_buffer_var.data, slow_memory_1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9_let, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8_let, fast_memory_0_buffer_var.data, slow_memory_1_buffer_var.data, dtype="int32")) @@ -193,7 +193,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8") T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16") fast_memory_6_buffer_var = T.match_buffer(fast_memory_6_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body tensor_2_let = T.Buffer([200704], dtype="uint8") with T.LetStmt(T.address_of(fast_memory_6_buffer_var[0], dtype="handle"), var=tensor_2_let.data): @@ -207,23 +207,23 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: @T.prim_func def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle, fast_memory_2_var: T.handle("uint8"), slow_memory_3_var: T.handle("uint8")) -> None: - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8") + placeholder_4 = T.match_buffer(placeholder_2, [150560], dtype="uint8") placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16") T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16") fast_memory_2_buffer_var = T.match_buffer(fast_memory_2_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_3_buffer_var = T.match_buffer(slow_memory_3_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_3_buffer_var = T.match_buffer(slow_memory_3_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body for ax0_ax1_fused_1, ax2_1, ax3_inner_1 in T.grid(224, 224, 3): T_subtract_1[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1] = T.cast(placeholder_4[ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1], "int16") - placeholder_5[0] @T.prim_func def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle, fast_memory_4_var: T.handle("uint8"), slow_memory_5_var: T.handle("uint8")) -> None: - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16") + placeholder_65 = T.match_buffer(placeholder_62, [150560], dtype="int16") placeholder_66 = 
T.match_buffer(placeholder_63, [9408], dtype="int16") placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32") T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8") fast_memory_4_buffer_var = T.match_buffer(fast_memory_4_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) - slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) + slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418560], dtype="uint8", strides=[1], elem_offset=0, align=16) # body PaddedInput_7_let = T.Buffer([157323], "int16") with T.LetStmt(T.address_of(slow_memory_5_buffer_var[802816], dtype="handle"), var=PaddedInput_7_let.data):
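
The bundle_deploy tests above replace stack buffers with 64-byte `aligned_alloc` allocations, matching the new `TVM_CRT_ALLOC_ALIGNMENT` default of 64. For a quick host-side spot check of the same property, a small helper is sketched below (illustrative only, not part of the patch; NumPy's allocator makes no 64-byte guarantee, so the check may legitimately report either result):

```python
import numpy as np

def is_aligned(array: np.ndarray, alignment: int = 64) -> bool:
    """Return True if the array's data pointer is aligned to `alignment` bytes."""
    return array.ctypes.data % alignment == 0

x = np.zeros(10 * 5, dtype="float32")
print(is_aligned(x, 64), is_aligned(x, 16))
```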
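The three Rust test-library builders now define the vector add directly in TVMScript. The sketch below shows one way to sanity-check such a PrimFunc from Python before the saved object file is handed to the Rust runtime; it reuses the function from the patch and otherwise assumes only the standard `tvm.build` / `tvm.nd` APIs:

```python
import numpy as np
import tvm
from tvm.script import tir as T

@T.prim_func
def vec_add(var_A: T.handle, var_B: T.handle, var_C: T.handle):
    T.func_attr({"global_symbol": "main", "tir.noalias": T.bool(True)})
    n = T.int32()
    A = T.match_buffer(var_A, (n,), align=1)
    B = T.match_buffer(var_B, (n,), align=1)
    C = T.match_buffer(var_C, (n,), align=1)
    for i in T.parallel(n):
        with T.block("C"):
            vi = T.axis.spatial(n, i)
            C[vi] = A[vi] + B[vi]

mod = tvm.build(vec_add, target="llvm")
a = tvm.nd.array(np.arange(8, dtype="float32"))
b = tvm.nd.array(np.ones(8, dtype="float32"))
c = tvm.nd.empty((8,), dtype="float32")
mod["main"](a, b, c)
np.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
```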
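`TVMDeviceAllocDataSpace` in crt_runtime_api.c now over-allocates, returns an aligned pointer inside the raw buffer, and stashes the original pointer just before the returned region so that `TVMDeviceFreeDataSpace` can recover it. A minimal Python model of that bookkeeping, assuming an 8-byte pointer and the usual round-up formula (a sketch of the idea, not the literal C implementation):

```python
POINTER_SIZE = 8  # assumed 64-bit target; the C code uses sizeof(void*)

def data_offset(base_addr: int, alignment: int) -> int:
    """Offset from the raw allocation to the aligned, caller-visible data region."""
    first_allowed = base_addr + POINTER_SIZE              # leave room for the back-pointer
    pad = (alignment - first_allowed % alignment) % alignment
    return POINTER_SIZE + pad

# The raw allocation therefore needs nbytes + POINTER_SIZE + (alignment - 1) bytes.
for base in (0x1000, 0x1003, 0x10FF):
    off = data_offset(base, 64)
    assert (base + off) % 64 == 0                         # returned pointer is aligned
    assert POINTER_SIZE <= off < POINTER_SIZE + 64        # fits inside the over-allocation
# On free, the original pointer is read back from the slot at (data_ptr - POINTER_SIZE).
```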
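With the contiguity and alignment ICHECKs removed from `NDArray::FromDLPack`, importing an offset or strided view now succeeds, and the requirements are enforced only when the array reaches a PrimFunc, as the new tests show. A minimal illustration (torch is used only as a convenient DLPack producer, mirroring the tests):

```python
import tvm
import torch  # any DLPack-exporting framework would do

backing = torch.arange(128, dtype=torch.int8)
view = backing[3:19]              # 16 elements at a 3-byte offset into the storage
arr = tvm.nd.from_dlpack(view)    # import no longer rejects the misaligned view

# Whether `arr` is usable depends on the PrimFunc: a buffer declared with
# align=1 accepts it, while align=64 raises tvm.TVMError at call time.
```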
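The assert that `ArgBinder::BindDLTensor` now emits boils down to `floormod(reinterpret(uintN, data_ptr), data_alignment) == 0` for device types whose `data` field is a plain pointer. Worked arithmetic for the two outcomes (the addresses are made up for illustration):

```python
data_alignment = 64
aligned_ptr   = 0x7F3A0040   # low 6 bits are zero, i.e. a multiple of 64
unaligned_ptr = 0x7F3A0043

assert aligned_ptr % data_alignment == 0     # assert passes, kernel runs
assert unaligned_ptr % data_alignment != 0   # assert fires:
# "<arg>.data is expected to be aligned to 64 bytes"
```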
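`RecordAllocateNodeInfo` now selects the workspace alignment with a fixed precedence: an enclosing DeclBuffer's `data_alignment`, then the executor's `workspace-byte-alignment` attribute, then `kAllocAlignment` (64 by default). The same precedence written out as a small illustrative helper (the names are not TVM APIs):

```python
K_ALLOC_ALIGNMENT = 64  # tvm::runtime::kAllocAlignment

def workspace_alignment(decl_buffer_alignment=None, executor_alignment=None):
    """Alignment recorded for an Allocate node during USMP buffer-info extraction."""
    if decl_buffer_alignment is not None:   # explicit DeclBuffer under the Allocate
        return decl_buffer_alignment
    if executor_alignment is not None:      # executor's workspace-byte-alignment attr
        return executor_alignment
    return K_ALLOC_ALIGNMENT                # runtime default

assert workspace_alignment() == 64
assert workspace_alignment(executor_alignment=16) == 16
assert workspace_alignment(decl_buffer_alignment=128, executor_alignment=16) == 128
```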