From d08330a32e13bd42960b3793118e5bbc544aa5d3 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Nov 2022 09:47:49 -0800 Subject: [PATCH 1/4] [Hexagon] Enable Hexagon User DMA bypass mode --- include/tvm/tir/builtin.h | 7 - src/driver/driver_api.cc | 1 + src/runtime/hexagon/hexagon_buffer.cc | 14 +- src/runtime/hexagon/hexagon_device_api.cc | 29 +-- src/runtime/hexagon/hexagon_user_dma.cc | 61 ++--- src/runtime/hexagon/hexagon_user_dma.h | 2 +- src/runtime/hexagon/hexagon_vtcm_pool.h | 10 + src/tir/op/builtin.cc | 3 - src/tir/transforms/lower_async_dma.cc | 14 +- src/tir/transforms/lower_tvm_builtin.cc | 15 -- .../hexagon/hexagon_user_dma_tests.cc | 151 +++++++++++- .../test_hexagon/test_async_dma_pipeline.py | 12 +- .../test_hexagon/test_cache_read_write.py | 226 ------------------ .../test_parallel_hvx_load_vtcm.py | 9 +- .../test_software_pipeline_async.py | 6 +- .../test_hexagon/test_vtcm_bandwidth.py | 7 +- 16 files changed, 223 insertions(+), 344 deletions(-) delete mode 100644 tests/python/contrib/test_hexagon/test_cache_read_write.py diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 9f6b7f9ce5d1..d830ea579aa7 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -713,13 +713,6 @@ TVM_DLL const Op& texture2d_store(); */ TVM_DLL const Op& texture2d_load(); -/*! - * \brief Copy 1d memory from source to destination - * Same semantics as memcpy(destination, source, size) - * Allows for device specific implementations e.g. direct memory access (DMA) - */ -TVM_DLL const Op& mem_copy(); - /*! * \brief Initiate a non-blocking DMA copy from source to destination */ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index bb4990e3e502..e5e3998b1e7b 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -53,6 +53,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.dma_bypass_cache", Bool); using tvm::Array; using tvm::transform::Pass; diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index c58026e83cfe..4a966f156780 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -26,13 +26,12 @@ #include "hexagon_common.h" #include "hexagon_device_api.h" +#include "qurt_memory.h" namespace tvm { namespace runtime { namespace hexagon { -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length); - struct Allocation { Allocation(size_t allocation_nbytes, size_t alignment) : allocation_nbytes_(allocation_nbytes), alignment_(alignment) {} @@ -237,8 +236,15 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. 
for (const auto& copy : macro_copies) { - int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); - CHECK_EQ(error_code, 0); + qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + memcpy(copy.dest, copy.src, copy.num_bytes); + qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); } } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 7221be03cc53..1c3b139d39a3 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -38,8 +38,6 @@ namespace tvm { namespace runtime { namespace hexagon { -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length); - HexagonDeviceAPI* HexagonDeviceAPI::Global() { static auto* inst = new HexagonDeviceAPI(); return inst; @@ -206,39 +204,38 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); } -TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor") +TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy_dltensor") .set_body([](TVMArgs args, TVMRetValue* rv) { DLTensor* dst = args[0]; DLTensor* src = args[1]; int size = args[2]; + ICHECK(size > 0); + bool bypass_cache = args[3]; - hexagon_user_dma_1d_sync(dst->data, src->data, size); + int ret = DMA_RETRY; + do { + ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(SYNC_DMA_QUEUE, dst->data, src->data, + size, bypass_cache); + } while (ret == DMA_RETRY); + CHECK(ret == DMA_SUCCESS); + HexagonDeviceAPI::Global()->UserDMA()->Wait(SYNC_DMA_QUEUE, 0); *rv = static_cast(0); }); -TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { - void* dst = args[0]; - void* src = args[1]; - int size = args[2]; - - int error_code = hexagon_user_dma_1d_sync(dst, src, size); - CHECK_EQ(error_code, 0); - - *rv = static_cast(0); -}); - TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) { int queue_id = args[0]; void* dst = args[1]; void* src = args[2]; int size = args[3]; ICHECK(size > 0); + bool bypass_cache = args[3]; int ret = DMA_RETRY; do { - ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size); + ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size, bypass_cache); } while (ret == DMA_RETRY); + CHECK(ret == DMA_SUCCESS); *rv = static_cast(ret); }); diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 619338e39688..c30fd645bbd0 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -32,7 +32,7 @@ unsigned int HexagonUserDMA::Init() { return status; } -int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) { +int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache) { // length limited to 24 bits if (length > DESC_LENGTH_MASK) { return DMA_FAILURE; @@ -66,8 +66,24 @@ int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) { dma_desc_set_desctype(dma_desc, DESC_DESCTYPE_1D); dma_desc_set_dstcomp(dma_desc, DESC_COMP_NONE); 
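+  // Cache bypass (set below) is applied only to the DDR end of a DDR <-> VTCM
+  // transfer, and only when bypass_cache is requested: the source side for
+  // DDR -> VTCM and the destination side for VTCM -> DDR. DDR -> DDR and
+  // VTCM -> VTCM copies keep both bypass bits off.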
dma_desc_set_srccomp(dma_desc, DESC_COMP_NONE); - dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF); - dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF); + + bool dst_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(dst, length); + bool src_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(src, length); + + // VTCM -> DDR with bypass enabled + if (dst_is_ddr && !src_is_ddr && bypass_cache) { + dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_ON); + } else { + dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF); + } + + // DDR -> VTCM with bypass enabled + if (src_is_ddr && !dst_is_ddr && bypass_cache) { + dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_ON); + } else { + dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF); + } + dma_desc_set_order(dma_desc, DESC_ORDER_ORDER); dma_desc_set_done(dma_desc, DESC_DONE_INCOMPLETE); dma_desc_set_src(dma_desc, src32); @@ -117,45 +133,6 @@ HexagonUserDMA::~HexagonUserDMA() { delete descriptors_; } -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { - HexagonUserDMA* user_dma = HexagonDeviceAPI::Global()->UserDMA(); - - // One DMA transfer can copy at most DESC_LENGTH_MASK bytes. - // Make the common case quick. - if (length <= DESC_LENGTH_MASK) { - // sync DMA -> `Copy` and then `Wait(0)` - int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, dst, src, length); - if (ret_val != DMA_SUCCESS) return ret_val; - user_dma->Wait(SYNC_DMA_QUEUE, 0); - return DMA_SUCCESS; - } - - // Split big transfers into smaller transfers. - char* cast_src = static_cast(src); - char* cast_dst = static_cast(dst); - for (uint32_t i = 0; i < length;) { - // Ensure there is no overflow while updating i - uint32_t cur_len = std::min(length - i, DESC_LENGTH_MASK); - // sync DMA -> `Copy` and then `Wait(0)` - int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, &cast_dst[i], &cast_src[i], cur_len); - if (ret_val != DMA_SUCCESS) return ret_val; - user_dma->Wait(SYNC_DMA_QUEUE, 0); - // 2 cases for new val for i: - // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT) - // new_i = i + (length - i) = length, no more iter - // and no overflow (since (length - i) <= (MAX_UINT - i)) - // 2. length - i > DESC_LENGTH_MASK - // length > (i + DESC_LENGTH_MASK) - // new_i = (i + DESC_LENGTH_MASK) - // length > new_i for next iter, we're done - // length - i > DESC_LENGTH_MASK - // and length <= MAX_UINT, - // so MAX_UINT >= length > DESC_LEN_MASK + i - // MAX_UINT > (DESC_LEN_MASK + i), so no overflow - i += cur_len; - } - return DMA_SUCCESS; -} } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index 01e143d255b4..9397a16e3f03 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -52,7 +52,7 @@ class HexagonUserDMA { * \param length Length in bytes to copy * \returns Status: DMA_SUCCESS or DMA_FAILURE */ - int Copy(int queue_id, void* dst, void* src, uint32_t length); + int Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache); /*! * \brief Wait until the number of DMAs in flight is less than or equal to some maximum diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h index 1c44a455196c..56f83967cd90 100644 --- a/src/runtime/hexagon/hexagon_vtcm_pool.h +++ b/src/runtime/hexagon/hexagon_vtcm_pool.h @@ -70,6 +70,16 @@ class HexagonVtcmPool { //! 
\brief Returns the total number of bytes in this pool size_t TotalBytes() { return reinterpret_cast(vtcm_size_); } + bool IsVtcm(void* ptr, unsigned size) { + auto char_ptr = static_cast(ptr); + auto char_vtcm = static_cast(vtcm_data_); + + if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) { + return true; + } + return false; + } + private: //! \brief Total size of VTCM pool unsigned int vtcm_size_; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 9feba142eb6a..56ecba9e9ed9 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -290,9 +290,6 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load) .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr("TCallEffectKind", - Integer(CallEffectKind::kOpaque)); - TIR_DEFINE_BUILTIN_FUNC(dma_copy).set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 417e9d61f263..b9ba4d41b7da 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -32,7 +32,7 @@ namespace tir { class AsyncDMALowerer : public StmtExprMutator { public: - AsyncDMALowerer() {} + explicit AsyncDMALowerer(bool dma_bypass_cache) : dma_bypass_cache_(dma_bypass_cache) {} Stmt VisitStmt_(const AttrStmtNode* op) final { // Convert this, for example: @@ -52,7 +52,7 @@ class AsyncDMALowerer : public StmtExprMutator { int queue_id = queue_id_node->value; // abort if we have not seen this queue ID in `copy` transform - if (queue_ids.find(queue_id) == queue_ids.end()) { + if (queue_ids_.find(queue_id) == queue_ids_.end()) { DLOG(INFO) << "AsyncDMALowerer exiting because the queue ID observed in the " "`async_wait_queue_scope` transform has not been previously observed in the " "`async_commit_queue_scope` transform"; @@ -160,7 +160,7 @@ class AsyncDMALowerer : public StmtExprMutator { // now that we are about to perform the `copy` transform // save queue ID for inspection in `wait` transform - queue_ids.insert(queue_id); + queue_ids_.insert(queue_id); return Evaluate(Call(DataType::Int(32), builtin::dma_copy(), {queue_id, @@ -168,13 +168,14 @@ class AsyncDMALowerer : public StmtExprMutator { {BufferLoad(bufferstorenode->buffer, store_index)}), Call(DataType::Handle(), builtin::address_of(), {BufferLoad(bufferloadnode->buffer, load_index)}), - for_loop->extent * bufferloadnode->dtype.bytes()})); + for_loop->extent * bufferloadnode->dtype.bytes(), dma_bypass_cache_})); } return StmtExprMutator::VisitStmt_(op); } private: - std::set queue_ids; + std::set queue_ids_; + bool dma_bypass_cache_; }; namespace transform { @@ -182,7 +183,8 @@ namespace transform { Pass LowerAsyncDMA() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { auto fptr = f.CopyOnWrite(); - fptr->body = AsyncDMALowerer()(std::move(fptr->body)); + bool dma_bypass_cache = ctx->GetConfig("tir.dma_bypass_cache", Bool(false)).value(); + fptr->body = AsyncDMALowerer(dma_bypass_cache)(std::move(fptr->body)); return f; }; return CreatePrimFuncPass(pass_func, 0, "tir.LowerAsyncDMA", {}); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index f79682ef7ecc..25d62539721f 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -315,8 +315,6 @@ class BuiltinLower : public StmtExprMutator { return MakeArray(op); } else if (op->op.same_as(builtin::tvm_context_id())) { 
return make_zero(op->dtype); - } else if (op->op.same_as(builtin::mem_copy())) { - return MakeMemCopy(op); } else if (op->op.same_as(builtin::dma_copy())) { return MakeDMACopy(op); } else if (op->op.same_as(builtin::dma_wait())) { @@ -326,19 +324,6 @@ class BuiltinLower : public StmtExprMutator { } } - PrimExpr MakeMemCopy(const CallNode* op) { - PrimExpr dst = op->args[0]; - PrimExpr src = op->args[1]; - PrimExpr size = op->args[2]; - - std::string fdevapi_prefix = - "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); - - Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); - return VisitExpr(call_packed); - } - PrimExpr MakeDMACopy(const CallNode* op) { PrimExpr queue_id = op->args[0]; PrimExpr dst = op->args[1]; diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index b76c7c652e6a..e4ffe3a0de9c 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -53,6 +53,10 @@ class HexagonUserDMATest : public ::testing::Test { char* src_char = nullptr; char* dst_char = nullptr; uint32_t length = 0x4000; // 16KB + const bool ENABLE_BYPASS = true; + const bool DISABLE_BYPASS = false; + Optional global_scope{"global"}; + Optional global_vtcm_scope{"global.vtcm"}; }; TEST_F(HexagonUserDMATest, wait) { @@ -67,14 +71,14 @@ TEST_F(HexagonUserDMATest, bad_copy) { void* src64 = reinterpret_cast(bigaddr); void* dst64 = reinterpret_cast(bigaddr); uint32_t biglength = 0x1000000; - ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length), DMA_SUCCESS); - ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length), DMA_SUCCESS); - ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length, DISABLE_BYPASS), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length, DISABLE_BYPASS), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength, DISABLE_BYPASS), DMA_SUCCESS); } TEST_F(HexagonUserDMATest, sync_dma) { // kick off 1 DMA - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); // wait for DMA to complete @@ -89,7 +93,7 @@ TEST_F(HexagonUserDMATest, sync_dma) { TEST_F(HexagonUserDMATest, async_dma_wait) { // kick off 10x duplicate DMAs for (uint32_t i = 0; i < 10; ++i) { - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -108,7 +112,7 @@ TEST_F(HexagonUserDMATest, async_dma_wait) { TEST_F(HexagonUserDMATest, async_dma_poll) { // kick off 10x duplicate DMAs for (uint32_t i = 0; i < 10; ++i) { - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -131,7 +135,7 @@ TEST_F(HexagonUserDMATest, pipeline) { for (uint32_t i = 0; i < pipeline_depth; ++i) { ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); } user_dma->Wait(queue_id, 3); @@ -168,35 +172,35 @@ TEST_F(HexagonUserDMATest, pipeline_write_queue) { for (uint32_t i = 0; i < pipeline_depth; ++i) { ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); } 
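+  // Wait until at most 3 transfers remain in flight on the read queue; transfers
+  // on a queue complete in order, so the first chunk has landed and is safe to modify.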
user_dma->Wait(queue_id, 3); for (uint32_t i = 0; i < pipeline_length; ++i) { dst_char[i]++; } - ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length); + ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 2); for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + pipeline_length, dst_char + pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 1); for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + 2 * pipeline_length, dst_char + 2 * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 0); for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + 3 * pipeline_length, dst_char + 3 * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(write_queue, 0); // verify @@ -214,7 +218,7 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) { for (uint32_t i = 0; i < number_of_dmas; ++i) { do { ret = user_dma->Copy(queue_id, dst_char + i * length_of_each_dma, - src_char + i * length_of_each_dma, length_of_each_dma); + src_char + i * length_of_each_dma, length_of_each_dma, DISABLE_BYPASS); } while (ret == DMA_RETRY); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -224,3 +228,124 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) { ASSERT_EQ(src_char[i], dst_char[i]); } } + +TEST_F(HexagonUserDMATest, sync_dma_bypass) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, sync_dma_bypass_vtcm_to_vtcm) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcm1hb(length, kHexagonAllocAlignment, global_vtcm_scope); + HexagonBuffer vtcm2hb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcm1hb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> VTCM + // NOTE: Cache bypass is disabled for VTCM -> VTCM transfers + ret = + user_dma->Copy(queue_id, vtcm2hb.GetPointer(), vtcm1hb.GetPointer(), length, DISABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcm2hb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy 
answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, sync_dma_bypass_) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } + + // change src data + for (uint32_t i = 0; i < length; ++i) { + src_char[i] = 2; + } + + // copy new src data to HexagonBuffer + srchb.CopyFrom(src, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index ef9b142d6f27..943b4e9ebc96 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -89,6 +89,7 @@ def evaluate( with tvm.transform.PassContext( config={ "tir.use_async_copy": use_async_copy, + "tir.dma_bypass_cache": 1, "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope, } ): @@ -133,7 +134,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", scope="global") T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), @@ -153,12 +154,13 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", a_bytes), + True, # bypass cache dtype="int32", ) ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( w_global_vtcm.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), @@ -178,6 +180,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", w_bytes), + True, # bypass cache dtype="int32", ) ) @@ -202,7 +205,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( c_buffer.data, 
T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), @@ -222,6 +225,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", a_bytes), + True, # bypass cache dtype="int32", ) ) @@ -290,7 +294,7 @@ class TestAsyncDMAPipeline: size_a = tvm.testing.parameter( 1024, 64 * 64, - 128 * 64, + # 128 * 64, # Only works on 8Gen1 HDK's ) size_w = tvm.testing.parameter( diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py deleted file mode 100644 index 3ac297fd80d8..000000000000 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ /dev/null @@ -1,226 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Lower cache_read and cache_write to Hexagon DMA via tensorize """ - -import numpy as np - -import tvm.testing -from tvm import te, tir -from tvm.contrib.hexagon.session import Session -from tvm.script import tir as T - -from .infrastructure import get_hexagon_target - - -def intrin_mem_copy(shape, dtype, dst_scope, src_scope): - """Define and return tensor intrinsic for mem copy""" - src = te.placeholder(shape=shape, dtype=dtype, name="src") - dst = te.compute(shape, lambda i: src[i], name="dst") - size = shape[0] * np.dtype(dtype).itemsize - - src_buffer = tvm.tir.decl_buffer( - shape, - dtype, - scope=src_scope, - offset_factor=1, - name="mem_copy_src_buffer", - ) - - dst_buffer = tvm.tir.decl_buffer( - shape, - dtype, - scope=dst_scope, - offset_factor=1, - name="mem_copy_dst_buffer", - ) - - zero_indices = [0 for _ in shape] - - def intrin_func(ins, outs): - ir_builder = tvm.tir.ir_builder.create() - - _src = ins[0] - _dst = outs[0] - - dst_handle = ir_builder.buffer_ptr(dst_buffer) - src_handle = ir_builder.buffer_ptr(src_buffer) - - ir_builder.emit( - tvm.tir.call_intrin( - "handle", - "tir.mem_copy", - tvm.tir.call_intrin("handle", "tir.address_of", dst_handle[zero_indices]), - tvm.tir.call_intrin("handle", "tir.address_of", src_handle[zero_indices]), - size, - ) - ) - return ir_builder.get() - - return te.decl_tensor_intrin(dst.op, intrin_func, binds={src: src_buffer, dst: dst_buffer}) - - -def verify(hexagon_session: Session, schedule, x_tensor, y_tensor, z_tensor, size): - """Verify correctness with reference from numpy""" - print(tvm.lower(schedule, [x_tensor, y_tensor, z_tensor])) - - func = tvm.build( - schedule, - [x_tensor, y_tensor, z_tensor], - get_hexagon_target("v68"), - name="dmacpy", - ) - - mod = hexagon_session.load_module(func) - x_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=x_tensor.dtype), - device=hexagon_session.device, - ) - y_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=y_tensor.dtype), - 
device=hexagon_session.device, - ) - z_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=z_tensor.dtype), - device=hexagon_session.device, - ) - mod["dmacpy"](x_array, y_array, z_array) - - ref = x_array.numpy() + y_array.numpy() - np.testing.assert_equal(z_array.numpy(), ref) - - -@tvm.testing.requires_hexagon -def test_cache_read_write(hexagon_session: Session): - """Test cache_read and cache_write to global.vtcm for hexagon""" - size = 128 - outer_shape = (size,) - factor = 16 - inner_shape = (factor,) - dtype = "int8" - - x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x") - y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y") - z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z") - s = te.create_schedule(z_tensor.op) - - x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor]) - y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor]) - z_vtcm = s.cache_write(z_tensor, "global.vtcm") - - zouter, _ = s[z_vtcm].split(z_vtcm.op.axis[0], factor=factor) - - s[x_vtcm].compute_at(s[z_vtcm], zouter) - s[y_vtcm].compute_at(s[z_vtcm], zouter) - - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") - - (cache_read_x,) = s[x_vtcm].op.axis - s[x_vtcm].tensorize(cache_read_x, mem_copy_read) - - (cache_read_y,) = s[y_vtcm].op.axis - s[y_vtcm].tensorize(cache_read_y, mem_copy_read) - - mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm") - - (cache_write_z,) = s[z_tensor].op.axis - s[z_tensor].tensorize(cache_write_z, mem_copy_write) - - verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size) - - -def layout_transform_2d(n): - return [n // 16, te.AXIS_SEPARATOR, n % 16] - - -@tvm.testing.requires_hexagon -def test_cache_read_write_2d(hexagon_session: Session): - """Test 2D cache_read and cache_write to global.vtcm for hexagon""" - size = 128 - outer_shape = (size,) - factor = 16 - inner_shape = (factor,) - dtype = "int8" - - x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x") - y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y") - z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z") - s = te.create_schedule(z_tensor.op) - - x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor]) - y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor]) - z_vtcm = s.cache_write(z_tensor, "global.vtcm") - - layout_x_vtcm = s[x_vtcm].transform_layout(layout_transform_2d) - layout_y_vtcm = s[y_vtcm].transform_layout(layout_transform_2d) - _ = s[z_vtcm].transform_layout(layout_transform_2d) - - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") - s[x_vtcm].tensorize(layout_x_vtcm[1], mem_copy_read) - s[y_vtcm].tensorize(layout_y_vtcm[1], mem_copy_read) - - # The loop schedule over `z` is not modified when calling `transform_layout` - # on `z_vtcm` above therefore we must call `split` to modify the loop schedule - # over `z` to match the layout of `z_vtcm` such that we can accurately write - # `z_vtcm` back to `z` using memory copy intrinsic - _, zinner = s[z_tensor].split(z_tensor.op.axis[0], factor=factor) - mem_copy_write = intrin_mem_copy(inner_shape, dtype, "global", "global.vtcm") - s[z_tensor].tensorize(zinner, mem_copy_write) - - verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size) - - -@T.prim_func -def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]): - for i in T.serial( - 0, - 8192, - ): - with T.block("C"): - buffer_c[i] = 
buffer_a[i] * T.int8(2) - - -def test_vtcm_lowering(): - """Test lowering with vtcm mem scope""" - mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main")) - sch = tir.Schedule(mod, debug_mask="all") - block_c = sch.get_block("C") - (flat,) = sch.get_loops(block_c) - outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128]) - cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm") - sch.compute_at(cache_block, outer) - lowered = tvm.lower(sch.mod["main"]) - - def ir_module_has_allocate_nodes(irmod): - nallocs = 0 - - def _visit(stmt): - nonlocal nallocs - if isinstance(stmt, tvm.tir.Allocate): - nallocs += 1 - - tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit) - return nallocs - - assert not ir_module_has_allocate_nodes(lowered), ( - "AllocateNode found in lowered IRModule, " - "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" - ) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index e6fc0a3c201c..6cca44388d09 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -213,7 +213,7 @@ def operator( ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -233,12 +233,13 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( b_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -258,6 +259,7 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) @@ -279,7 +281,7 @@ def operator( ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( c_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -299,6 +301,7 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index ba7513a4f39c..387d0f20c4c2 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -178,7 +178,11 @@ def test_async_software_pipeline( ref = reference(a_np, b_np) with tvm.transform.PassContext( - config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + config={ + "tir.use_async_copy": 1, + "tir.dma_bypass_cache": 1, + "tir.merge_async_commit_queue_scope": False, + } ): # tvm.lower(schedule.mod["main"]).show() func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 980ac0cf4c2a..afe33e4ab4ed 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -25,7 +25,7 @@ from .infrastructure import get_hexagon_target -MB = 1024**2 +MB = 1024 ** 2 KB = 1024 TEST_OUTPUT_TEMPLATE = ( "Test bandwidth 
with buffer size {}MB... \n" @@ -61,7 +61,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -81,6 +81,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) @@ -113,7 +114,7 @@ def evaluate(hexagon_session, sch, size): ) runtime = timer(a_hexagon, a_vtcm_hexagon) - gbps = round((size / 2**30) / runtime.mean, 4) + gbps = round((size / 2 ** 30) / runtime.mean, 4) tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a) return gbps From f49ea460a0e6f8ab010b809d12faab9a041c7843 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 14 Nov 2022 14:49:15 -0800 Subject: [PATCH 2/4] fix python lint errors --- tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index afe33e4ab4ed..0b6b52335cb5 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -25,7 +25,7 @@ from .infrastructure import get_hexagon_target -MB = 1024 ** 2 +MB = 1024**2 KB = 1024 TEST_OUTPUT_TEMPLATE = ( "Test bandwidth with buffer size {}MB... \n" @@ -114,7 +114,7 @@ def evaluate(hexagon_session, sch, size): ) runtime = timer(a_hexagon, a_vtcm_hexagon) - gbps = round((size / 2 ** 30) / runtime.mean, 4) + gbps = round((size / 2**30) / runtime.mean, 4) tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a) return gbps From 18ef71fc728e1662fb2fd61ab2262348673baeb1 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 15 Nov 2022 10:22:40 -0800 Subject: [PATCH 3/4] restore vtcm tests; add TODO for ION buffer; check IsVtcm pointers --- src/runtime/hexagon/hexagon_buffer.cc | 4 ++ src/runtime/hexagon/hexagon_vtcm_pool.h | 2 + .../python/contrib/test_hexagon/test_vtcm.py | 65 +++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/test_vtcm.py diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 4a966f156780..b8c7bd2cb96e 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -236,6 +236,10 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. 
for (const auto& copy : macro_copies) { + // clean Hexagon cache before / after memcpy to ensure clean cache state to enable usage of DMA + // bypass mode for increased DMA bandwidth + // TODO(HWE): Switch to ION Buffer to avoid need for memcpy and potentially lighten or alleviate + // the burden of cache invalidation in this code qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h index 56f83967cd90..2e0918e997c4 100644 --- a/src/runtime/hexagon/hexagon_vtcm_pool.h +++ b/src/runtime/hexagon/hexagon_vtcm_pool.h @@ -72,7 +72,9 @@ class HexagonVtcmPool { bool IsVtcm(void* ptr, unsigned size) { auto char_ptr = static_cast(ptr); + CHECK(char_ptr != nullptr); auto char_vtcm = static_cast(vtcm_data_); + CHECK(vtcm_data_ != nullptr); if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) { return true; diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py new file mode 100644 index 000000000000..ddd5f7bb4b88 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm.testing +from tvm import tir +from tvm.contrib.hexagon.session import Session +from tvm.script import tir as T + +from .infrastructure import get_hexagon_target + + +@T.prim_func +def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]): + for i in T.serial( + 0, + 8192, + ): + with T.block("C"): + buffer_c[i] = buffer_a[i] * T.int8(2) + + +def test_vtcm_lowering(): + """Test lowering with vtcm mem scope""" + mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main")) + sch = tir.Schedule(mod, debug_mask="all") + block_c = sch.get_block("C") + (flat,) = sch.get_loops(block_c) + outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128]) + cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm") + sch.compute_at(cache_block, outer) + lowered = tvm.lower(sch.mod["main"]) + + def ir_module_has_allocate_nodes(irmod): + nallocs = 0 + + def _visit(stmt): + nonlocal nallocs + if isinstance(stmt, tvm.tir.Allocate): + nallocs += 1 + + tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit) + return nallocs + + assert not ir_module_has_allocate_nodes(lowered), ( + "AllocateNode found in lowered IRModule, " + "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" + ) + + +if __name__ == "__main__": + tvm.testing.main() From 21063ff499308f58eff88534c54e1b6a1df29633 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 15 Nov 2022 14:26:47 -0800 Subject: [PATCH 4/4] fix py lint errors; reset test-only changes --- tests/python/contrib/test_hexagon/test_vtcm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py index ddd5f7bb4b88..11188436a318 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_vtcm.py @@ -14,14 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""VTCM Tests""" import tvm.testing from tvm import tir -from tvm.contrib.hexagon.session import Session from tvm.script import tir as T -from .infrastructure import get_hexagon_target - @T.prim_func def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]):
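
For reference, a minimal sketch of the synchronous copy path after this series, modeled on the `device_api.hexagon.dma_copy_dltensor` body in patch 1/4: retry while the descriptor ring is temporarily full, then drain the synchronous queue. The helper name `sync_dma_copy_sketch` and the include list are illustrative assumptions, not part of the patch; only the `Copy`/`Wait` calls, the queue constant, and the DMA status codes come from the series itself.

    // Sketch only: a synchronous, optionally cache-bypassing DMA copy using the
    // updated HexagonUserDMA::Copy signature introduced in this series.
    #include "hexagon_device_api.h"
    #include "hexagon_user_dma.h"

    namespace tvm {
    namespace runtime {
    namespace hexagon {

    int sync_dma_copy_sketch(void* dst, void* src, uint32_t length, bool bypass_cache) {
      HexagonUserDMA* dma = HexagonDeviceAPI::Global()->UserDMA();
      int ret = DMA_RETRY;
      do {
        // Copy() rejects lengths above DESC_LENGTH_MASK (24 bits) with DMA_FAILURE,
        // so only DMA_RETRY (descriptor ring temporarily full) is retried here.
        ret = dma->Copy(SYNC_DMA_QUEUE, dst, src, length, bypass_cache);
      } while (ret == DMA_RETRY);
      if (ret != DMA_SUCCESS) {
        return ret;
      }
      // Block until no transfers remain in flight on the synchronous queue.
      dma->Wait(SYNC_DMA_QUEUE, 0);
      return DMA_SUCCESS;
    }

    }  // namespace hexagon
    }  // namespace runtime
    }  // namespace tvm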