From d08330a32e13bd42960b3793118e5bbc544aa5d3 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Nov 2022 09:47:49 -0800 Subject: [PATCH 1/4] [Hexagon] Enable Hexagon User DMA bypass mode --- include/tvm/tir/builtin.h | 7 - src/driver/driver_api.cc | 1 + src/runtime/hexagon/hexagon_buffer.cc | 14 +- src/runtime/hexagon/hexagon_device_api.cc | 29 +-- src/runtime/hexagon/hexagon_user_dma.cc | 61 ++--- src/runtime/hexagon/hexagon_user_dma.h | 2 +- src/runtime/hexagon/hexagon_vtcm_pool.h | 10 + src/tir/op/builtin.cc | 3 - src/tir/transforms/lower_async_dma.cc | 14 +- src/tir/transforms/lower_tvm_builtin.cc | 15 -- .../hexagon/hexagon_user_dma_tests.cc | 151 +++++++++++- .../test_hexagon/test_async_dma_pipeline.py | 12 +- .../test_hexagon/test_cache_read_write.py | 226 ------------------ .../test_parallel_hvx_load_vtcm.py | 9 +- .../test_software_pipeline_async.py | 6 +- .../test_hexagon/test_vtcm_bandwidth.py | 7 +- 16 files changed, 223 insertions(+), 344 deletions(-) delete mode 100644 tests/python/contrib/test_hexagon/test_cache_read_write.py diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 9f6b7f9ce5d1..d830ea579aa7 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -713,13 +713,6 @@ TVM_DLL const Op& texture2d_store(); */ TVM_DLL const Op& texture2d_load(); -/*! - * \brief Copy 1d memory from source to destination - * Same semantics as memcpy(destination, source, size) - * Allows for device specific implementations e.g. direct memory access (DMA) - */ -TVM_DLL const Op& mem_copy(); - /*! * \brief Initiate a non-blocking DMA copy from source to destination */ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index bb4990e3e502..e5e3998b1e7b 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -53,6 +53,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.dma_bypass_cache", Bool); using tvm::Array; using tvm::transform::Pass; diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index c58026e83cfe..4a966f156780 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -26,13 +26,12 @@ #include "hexagon_common.h" #include "hexagon_device_api.h" +#include "qurt_memory.h" namespace tvm { namespace runtime { namespace hexagon { -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length); - struct Allocation { Allocation(size_t allocation_nbytes, size_t alignment) : allocation_nbytes_(allocation_nbytes), alignment_(alignment) {} @@ -237,8 +236,15 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. 
for (const auto& copy : macro_copies) { - int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); - CHECK_EQ(error_code, 0); + qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + memcpy(copy.dest, copy.src, copy.num_bytes); + qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); + qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, + QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); } } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 7221be03cc53..1c3b139d39a3 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -38,8 +38,6 @@ namespace tvm { namespace runtime { namespace hexagon { -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length); - HexagonDeviceAPI* HexagonDeviceAPI::Global() { static auto* inst = new HexagonDeviceAPI(); return inst; @@ -206,39 +204,38 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); } -TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor") +TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy_dltensor") .set_body([](TVMArgs args, TVMRetValue* rv) { DLTensor* dst = args[0]; DLTensor* src = args[1]; int size = args[2]; + ICHECK(size > 0); + bool bypass_cache = args[3]; - hexagon_user_dma_1d_sync(dst->data, src->data, size); + int ret = DMA_RETRY; + do { + ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(SYNC_DMA_QUEUE, dst->data, src->data, + size, bypass_cache); + } while (ret == DMA_RETRY); + CHECK(ret == DMA_SUCCESS); + HexagonDeviceAPI::Global()->UserDMA()->Wait(SYNC_DMA_QUEUE, 0); *rv = static_cast(0); }); -TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { - void* dst = args[0]; - void* src = args[1]; - int size = args[2]; - - int error_code = hexagon_user_dma_1d_sync(dst, src, size); - CHECK_EQ(error_code, 0); - - *rv = static_cast(0); -}); - TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) { int queue_id = args[0]; void* dst = args[1]; void* src = args[2]; int size = args[3]; ICHECK(size > 0); + bool bypass_cache = args[3]; int ret = DMA_RETRY; do { - ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size); + ret = HexagonDeviceAPI::Global()->UserDMA()->Copy(queue_id, dst, src, size, bypass_cache); } while (ret == DMA_RETRY); + CHECK(ret == DMA_SUCCESS); *rv = static_cast(ret); }); diff --git a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 619338e39688..c30fd645bbd0 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -32,7 +32,7 @@ unsigned int HexagonUserDMA::Init() { return status; } -int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) { +int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache) { // length limited to 24 bits if (length > DESC_LENGTH_MASK) { return DMA_FAILURE; @@ -66,8 +66,24 @@ int HexagonUserDMA::Copy(int queue_id, void* dst, void* src, uint32_t length) { dma_desc_set_desctype(dma_desc, DESC_DESCTYPE_1D); dma_desc_set_dstcomp(dma_desc, DESC_COMP_NONE); 
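+  // Cache bypass (set below) is applied only to the DDR end of a DDR <-> VTCM
+  // transfer, and only when bypass_cache is requested: the source side for
+  // DDR -> VTCM and the destination side for VTCM -> DDR. DDR -> DDR and
+  // VTCM -> VTCM copies keep both bypass bits off.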
dma_desc_set_srccomp(dma_desc, DESC_COMP_NONE); - dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF); - dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF); + + bool dst_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(dst, length); + bool src_is_ddr = !HexagonDeviceAPI::Global()->VtcmPool()->IsVtcm(src, length); + + // VTCM -> DDR with bypass enabled + if (dst_is_ddr && !src_is_ddr && bypass_cache) { + dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_ON); + } else { + dma_desc_set_bypassdst(dma_desc, DESC_BYPASS_OFF); + } + + // DDR -> VTCM with bypass enabled + if (src_is_ddr && !dst_is_ddr && bypass_cache) { + dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_ON); + } else { + dma_desc_set_bypasssrc(dma_desc, DESC_BYPASS_OFF); + } + dma_desc_set_order(dma_desc, DESC_ORDER_ORDER); dma_desc_set_done(dma_desc, DESC_DONE_INCOMPLETE); dma_desc_set_src(dma_desc, src32); @@ -117,45 +133,6 @@ HexagonUserDMA::~HexagonUserDMA() { delete descriptors_; } -int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { - HexagonUserDMA* user_dma = HexagonDeviceAPI::Global()->UserDMA(); - - // One DMA transfer can copy at most DESC_LENGTH_MASK bytes. - // Make the common case quick. - if (length <= DESC_LENGTH_MASK) { - // sync DMA -> `Copy` and then `Wait(0)` - int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, dst, src, length); - if (ret_val != DMA_SUCCESS) return ret_val; - user_dma->Wait(SYNC_DMA_QUEUE, 0); - return DMA_SUCCESS; - } - - // Split big transfers into smaller transfers. - char* cast_src = static_cast(src); - char* cast_dst = static_cast(dst); - for (uint32_t i = 0; i < length;) { - // Ensure there is no overflow while updating i - uint32_t cur_len = std::min(length - i, DESC_LENGTH_MASK); - // sync DMA -> `Copy` and then `Wait(0)` - int ret_val = user_dma->Copy(SYNC_DMA_QUEUE, &cast_dst[i], &cast_src[i], cur_len); - if (ret_val != DMA_SUCCESS) return ret_val; - user_dma->Wait(SYNC_DMA_QUEUE, 0); - // 2 cases for new val for i: - // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT) - // new_i = i + (length - i) = length, no more iter - // and no overflow (since (length - i) <= (MAX_UINT - i)) - // 2. length - i > DESC_LENGTH_MASK - // length > (i + DESC_LENGTH_MASK) - // new_i = (i + DESC_LENGTH_MASK) - // length > new_i for next iter, we're done - // length - i > DESC_LENGTH_MASK - // and length <= MAX_UINT, - // so MAX_UINT >= length > DESC_LEN_MASK + i - // MAX_UINT > (DESC_LEN_MASK + i), so no overflow - i += cur_len; - } - return DMA_SUCCESS; -} } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h index 01e143d255b4..9397a16e3f03 100644 --- a/src/runtime/hexagon/hexagon_user_dma.h +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -52,7 +52,7 @@ class HexagonUserDMA { * \param length Length in bytes to copy * \returns Status: DMA_SUCCESS or DMA_FAILURE */ - int Copy(int queue_id, void* dst, void* src, uint32_t length); + int Copy(int queue_id, void* dst, void* src, uint32_t length, bool bypass_cache); /*! * \brief Wait until the number of DMAs in flight is less than or equal to some maximum diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h index 1c44a455196c..56f83967cd90 100644 --- a/src/runtime/hexagon/hexagon_vtcm_pool.h +++ b/src/runtime/hexagon/hexagon_vtcm_pool.h @@ -70,6 +70,16 @@ class HexagonVtcmPool { //! 
\brief Returns the total number of bytes in this pool size_t TotalBytes() { return reinterpret_cast(vtcm_size_); } + bool IsVtcm(void* ptr, unsigned size) { + auto char_ptr = static_cast(ptr); + auto char_vtcm = static_cast(vtcm_data_); + + if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) { + return true; + } + return false; + } + private: //! \brief Total size of VTCM pool unsigned int vtcm_size_; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 9feba142eb6a..56ecba9e9ed9 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -290,9 +290,6 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load) .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr("TCallEffectKind", - Integer(CallEffectKind::kOpaque)); - TIR_DEFINE_BUILTIN_FUNC(dma_copy).set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 417e9d61f263..b9ba4d41b7da 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -32,7 +32,7 @@ namespace tir { class AsyncDMALowerer : public StmtExprMutator { public: - AsyncDMALowerer() {} + explicit AsyncDMALowerer(bool dma_bypass_cache) : dma_bypass_cache_(dma_bypass_cache) {} Stmt VisitStmt_(const AttrStmtNode* op) final { // Convert this, for example: @@ -52,7 +52,7 @@ class AsyncDMALowerer : public StmtExprMutator { int queue_id = queue_id_node->value; // abort if we have not seen this queue ID in `copy` transform - if (queue_ids.find(queue_id) == queue_ids.end()) { + if (queue_ids_.find(queue_id) == queue_ids_.end()) { DLOG(INFO) << "AsyncDMALowerer exiting because the queue ID observed in the " "`async_wait_queue_scope` transform has not been previously observed in the " "`async_commit_queue_scope` transform"; @@ -160,7 +160,7 @@ class AsyncDMALowerer : public StmtExprMutator { // now that we are about to perform the `copy` transform // save queue ID for inspection in `wait` transform - queue_ids.insert(queue_id); + queue_ids_.insert(queue_id); return Evaluate(Call(DataType::Int(32), builtin::dma_copy(), {queue_id, @@ -168,13 +168,14 @@ class AsyncDMALowerer : public StmtExprMutator { {BufferLoad(bufferstorenode->buffer, store_index)}), Call(DataType::Handle(), builtin::address_of(), {BufferLoad(bufferloadnode->buffer, load_index)}), - for_loop->extent * bufferloadnode->dtype.bytes()})); + for_loop->extent * bufferloadnode->dtype.bytes(), dma_bypass_cache_})); } return StmtExprMutator::VisitStmt_(op); } private: - std::set queue_ids; + std::set queue_ids_; + bool dma_bypass_cache_; }; namespace transform { @@ -182,7 +183,8 @@ namespace transform { Pass LowerAsyncDMA() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { auto fptr = f.CopyOnWrite(); - fptr->body = AsyncDMALowerer()(std::move(fptr->body)); + bool dma_bypass_cache = ctx->GetConfig("tir.dma_bypass_cache", Bool(false)).value(); + fptr->body = AsyncDMALowerer(dma_bypass_cache)(std::move(fptr->body)); return f; }; return CreatePrimFuncPass(pass_func, 0, "tir.LowerAsyncDMA", {}); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index f79682ef7ecc..25d62539721f 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -315,8 +315,6 @@ class BuiltinLower : public StmtExprMutator { return MakeArray(op); } else if (op->op.same_as(builtin::tvm_context_id())) { 
return make_zero(op->dtype); - } else if (op->op.same_as(builtin::mem_copy())) { - return MakeMemCopy(op); } else if (op->op.same_as(builtin::dma_copy())) { return MakeDMACopy(op); } else if (op->op.same_as(builtin::dma_wait())) { @@ -326,19 +324,6 @@ class BuiltinLower : public StmtExprMutator { } } - PrimExpr MakeMemCopy(const CallNode* op) { - PrimExpr dst = op->args[0]; - PrimExpr src = op->args[1]; - PrimExpr size = op->args[2]; - - std::string fdevapi_prefix = - "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); - - Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); - return VisitExpr(call_packed); - } - PrimExpr MakeDMACopy(const CallNode* op) { PrimExpr queue_id = op->args[0]; PrimExpr dst = op->args[1]; diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc index b76c7c652e6a..e4ffe3a0de9c 100644 --- a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -53,6 +53,10 @@ class HexagonUserDMATest : public ::testing::Test { char* src_char = nullptr; char* dst_char = nullptr; uint32_t length = 0x4000; // 16KB + const bool ENABLE_BYPASS = true; + const bool DISABLE_BYPASS = false; + Optional global_scope{"global"}; + Optional global_vtcm_scope{"global.vtcm"}; }; TEST_F(HexagonUserDMATest, wait) { @@ -67,14 +71,14 @@ TEST_F(HexagonUserDMATest, bad_copy) { void* src64 = reinterpret_cast(bigaddr); void* dst64 = reinterpret_cast(bigaddr); uint32_t biglength = 0x1000000; - ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length), DMA_SUCCESS); - ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length), DMA_SUCCESS); - ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst64, src, length, DISABLE_BYPASS), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst, src64, length, DISABLE_BYPASS), DMA_SUCCESS); + ASSERT_NE(user_dma->Copy(queue_id, dst, src, biglength, DISABLE_BYPASS), DMA_SUCCESS); } TEST_F(HexagonUserDMATest, sync_dma) { // kick off 1 DMA - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); // wait for DMA to complete @@ -89,7 +93,7 @@ TEST_F(HexagonUserDMATest, sync_dma) { TEST_F(HexagonUserDMATest, async_dma_wait) { // kick off 10x duplicate DMAs for (uint32_t i = 0; i < 10; ++i) { - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -108,7 +112,7 @@ TEST_F(HexagonUserDMATest, async_dma_wait) { TEST_F(HexagonUserDMATest, async_dma_poll) { // kick off 10x duplicate DMAs for (uint32_t i = 0; i < 10; ++i) { - ret = user_dma->Copy(queue_id, dst, src, length); + ret = user_dma->Copy(queue_id, dst, src, length, DISABLE_BYPASS); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -131,7 +135,7 @@ TEST_F(HexagonUserDMATest, pipeline) { for (uint32_t i = 0; i < pipeline_depth; ++i) { ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); } user_dma->Wait(queue_id, 3); @@ -168,35 +172,35 @@ TEST_F(HexagonUserDMATest, pipeline_write_queue) { for (uint32_t i = 0; i < pipeline_depth; ++i) { ret |= user_dma->Copy(queue_id, dst_char + i * pipeline_length, src_char + i * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); } 
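+  // Wait until at most 3 transfers remain in flight on the read queue; transfers
+  // on a queue complete in order, so the first chunk has landed and is safe to modify.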
user_dma->Wait(queue_id, 3); for (uint32_t i = 0; i < pipeline_length; ++i) { dst_char[i]++; } - ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length); + ret |= user_dma->Copy(write_queue, src_char, dst_char, pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 2); for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + pipeline_length, dst_char + pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 1); for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + 2 * pipeline_length, dst_char + 2 * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(queue_id, 0); for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) { dst_char[i]++; } ret |= user_dma->Copy(write_queue, src_char + 3 * pipeline_length, dst_char + 3 * pipeline_length, - pipeline_length); + pipeline_length, DISABLE_BYPASS); user_dma->Wait(write_queue, 0); // verify @@ -214,7 +218,7 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) { for (uint32_t i = 0; i < number_of_dmas; ++i) { do { ret = user_dma->Copy(queue_id, dst_char + i * length_of_each_dma, - src_char + i * length_of_each_dma, length_of_each_dma); + src_char + i * length_of_each_dma, length_of_each_dma, DISABLE_BYPASS); } while (ret == DMA_RETRY); ASSERT_EQ(ret, DMA_SUCCESS); } @@ -224,3 +228,124 @@ TEST_F(HexagonUserDMATest, overflow_ring_buffer) { ASSERT_EQ(src_char[i], dst_char[i]); } } + +TEST_F(HexagonUserDMATest, sync_dma_bypass) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, sync_dma_bypass_vtcm_to_vtcm) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcm1hb(length, kHexagonAllocAlignment, global_vtcm_scope); + HexagonBuffer vtcm2hb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcm1hb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> VTCM + // NOTE: Cache bypass is disabled for VTCM -> VTCM transfers + ret = + user_dma->Copy(queue_id, vtcm2hb.GetPointer(), vtcm1hb.GetPointer(), length, DISABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcm2hb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy 
answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, sync_dma_bypass_) { + HexagonBuffer srchb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer dsthb(length, kHexagonAllocAlignment, global_scope); + HexagonBuffer vtcmhb(length, kHexagonAllocAlignment, global_vtcm_scope); + + // init src, dst HexagonBuffers + srchb.CopyFrom(src, length); + dsthb.CopyFrom(dst, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } + + // change src data + for (uint32_t i = 0; i < length; ++i) { + src_char[i] = 2; + } + + // copy new src data to HexagonBuffer + srchb.CopyFrom(src, length); + + // DDR src -> VTCM + ret = user_dma->Copy(queue_id, vtcmhb.GetPointer(), srchb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // VTCM -> DDR dst + ret = user_dma->Copy(queue_id, dsthb.GetPointer(), vtcmhb.GetPointer(), length, ENABLE_BYPASS); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMAs to complete + user_dma->Wait(queue_id, 0); + + // copy answer from dst HexagonBuffer + dsthb.CopyTo(dst, length); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index ef9b142d6f27..943b4e9ebc96 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -89,6 +89,7 @@ def evaluate( with tvm.transform.PassContext( config={ "tir.use_async_copy": use_async_copy, + "tir.dma_bypass_cache": 1, "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope, } ): @@ -133,7 +134,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", scope="global") T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), @@ -153,12 +154,13 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", a_bytes), + True, # bypass cache dtype="int32", ) ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( w_global_vtcm.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), @@ -178,6 +180,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", w_bytes), + True, # bypass cache dtype="int32", ) ) @@ -202,7 +205,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( c_buffer.data, 
T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), @@ -222,6 +225,7 @@ def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: dtype="handle", ), T.Cast("int", a_bytes), + True, # bypass cache dtype="int32", ) ) @@ -290,7 +294,7 @@ class TestAsyncDMAPipeline: size_a = tvm.testing.parameter( 1024, 64 * 64, - 128 * 64, + # 128 * 64, # Only works on 8Gen1 HDK's ) size_w = tvm.testing.parameter( diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py deleted file mode 100644 index 3ac297fd80d8..000000000000 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ /dev/null @@ -1,226 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Lower cache_read and cache_write to Hexagon DMA via tensorize """ - -import numpy as np - -import tvm.testing -from tvm import te, tir -from tvm.contrib.hexagon.session import Session -from tvm.script import tir as T - -from .infrastructure import get_hexagon_target - - -def intrin_mem_copy(shape, dtype, dst_scope, src_scope): - """Define and return tensor intrinsic for mem copy""" - src = te.placeholder(shape=shape, dtype=dtype, name="src") - dst = te.compute(shape, lambda i: src[i], name="dst") - size = shape[0] * np.dtype(dtype).itemsize - - src_buffer = tvm.tir.decl_buffer( - shape, - dtype, - scope=src_scope, - offset_factor=1, - name="mem_copy_src_buffer", - ) - - dst_buffer = tvm.tir.decl_buffer( - shape, - dtype, - scope=dst_scope, - offset_factor=1, - name="mem_copy_dst_buffer", - ) - - zero_indices = [0 for _ in shape] - - def intrin_func(ins, outs): - ir_builder = tvm.tir.ir_builder.create() - - _src = ins[0] - _dst = outs[0] - - dst_handle = ir_builder.buffer_ptr(dst_buffer) - src_handle = ir_builder.buffer_ptr(src_buffer) - - ir_builder.emit( - tvm.tir.call_intrin( - "handle", - "tir.mem_copy", - tvm.tir.call_intrin("handle", "tir.address_of", dst_handle[zero_indices]), - tvm.tir.call_intrin("handle", "tir.address_of", src_handle[zero_indices]), - size, - ) - ) - return ir_builder.get() - - return te.decl_tensor_intrin(dst.op, intrin_func, binds={src: src_buffer, dst: dst_buffer}) - - -def verify(hexagon_session: Session, schedule, x_tensor, y_tensor, z_tensor, size): - """Verify correctness with reference from numpy""" - print(tvm.lower(schedule, [x_tensor, y_tensor, z_tensor])) - - func = tvm.build( - schedule, - [x_tensor, y_tensor, z_tensor], - get_hexagon_target("v68"), - name="dmacpy", - ) - - mod = hexagon_session.load_module(func) - x_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=x_tensor.dtype), - device=hexagon_session.device, - ) - y_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=y_tensor.dtype), - 
device=hexagon_session.device, - ) - z_array = tvm.nd.array( - np.random.randint(low=-128, high=127, size=size, dtype=z_tensor.dtype), - device=hexagon_session.device, - ) - mod["dmacpy"](x_array, y_array, z_array) - - ref = x_array.numpy() + y_array.numpy() - np.testing.assert_equal(z_array.numpy(), ref) - - -@tvm.testing.requires_hexagon -def test_cache_read_write(hexagon_session: Session): - """Test cache_read and cache_write to global.vtcm for hexagon""" - size = 128 - outer_shape = (size,) - factor = 16 - inner_shape = (factor,) - dtype = "int8" - - x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x") - y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y") - z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z") - s = te.create_schedule(z_tensor.op) - - x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor]) - y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor]) - z_vtcm = s.cache_write(z_tensor, "global.vtcm") - - zouter, _ = s[z_vtcm].split(z_vtcm.op.axis[0], factor=factor) - - s[x_vtcm].compute_at(s[z_vtcm], zouter) - s[y_vtcm].compute_at(s[z_vtcm], zouter) - - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") - - (cache_read_x,) = s[x_vtcm].op.axis - s[x_vtcm].tensorize(cache_read_x, mem_copy_read) - - (cache_read_y,) = s[y_vtcm].op.axis - s[y_vtcm].tensorize(cache_read_y, mem_copy_read) - - mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm") - - (cache_write_z,) = s[z_tensor].op.axis - s[z_tensor].tensorize(cache_write_z, mem_copy_write) - - verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size) - - -def layout_transform_2d(n): - return [n // 16, te.AXIS_SEPARATOR, n % 16] - - -@tvm.testing.requires_hexagon -def test_cache_read_write_2d(hexagon_session: Session): - """Test 2D cache_read and cache_write to global.vtcm for hexagon""" - size = 128 - outer_shape = (size,) - factor = 16 - inner_shape = (factor,) - dtype = "int8" - - x_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="x") - y_tensor = te.placeholder(shape=outer_shape, dtype=dtype, name="y") - z_tensor = te.compute(outer_shape, lambda i: x_tensor[i] + y_tensor[i], name="z") - s = te.create_schedule(z_tensor.op) - - x_vtcm = s.cache_read(x_tensor, "global.vtcm", [z_tensor]) - y_vtcm = s.cache_read(y_tensor, "global.vtcm", [z_tensor]) - z_vtcm = s.cache_write(z_tensor, "global.vtcm") - - layout_x_vtcm = s[x_vtcm].transform_layout(layout_transform_2d) - layout_y_vtcm = s[y_vtcm].transform_layout(layout_transform_2d) - _ = s[z_vtcm].transform_layout(layout_transform_2d) - - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") - s[x_vtcm].tensorize(layout_x_vtcm[1], mem_copy_read) - s[y_vtcm].tensorize(layout_y_vtcm[1], mem_copy_read) - - # The loop schedule over `z` is not modified when calling `transform_layout` - # on `z_vtcm` above therefore we must call `split` to modify the loop schedule - # over `z` to match the layout of `z_vtcm` such that we can accurately write - # `z_vtcm` back to `z` using memory copy intrinsic - _, zinner = s[z_tensor].split(z_tensor.op.axis[0], factor=factor) - mem_copy_write = intrin_mem_copy(inner_shape, dtype, "global", "global.vtcm") - s[z_tensor].tensorize(zinner, mem_copy_write) - - verify(hexagon_session, s, x_tensor, y_tensor, z_tensor, size) - - -@T.prim_func -def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]): - for i in T.serial( - 0, - 8192, - ): - with T.block("C"): - buffer_c[i] = 
buffer_a[i] * T.int8(2) - - -def test_vtcm_lowering(): - """Test lowering with vtcm mem scope""" - mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main")) - sch = tir.Schedule(mod, debug_mask="all") - block_c = sch.get_block("C") - (flat,) = sch.get_loops(block_c) - outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128]) - cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm") - sch.compute_at(cache_block, outer) - lowered = tvm.lower(sch.mod["main"]) - - def ir_module_has_allocate_nodes(irmod): - nallocs = 0 - - def _visit(stmt): - nonlocal nallocs - if isinstance(stmt, tvm.tir.Allocate): - nallocs += 1 - - tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit) - return nallocs - - assert not ir_module_has_allocate_nodes(lowered), ( - "AllocateNode found in lowered IRModule, " - "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" - ) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index e6fc0a3c201c..6cca44388d09 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -213,7 +213,7 @@ def operator( ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -233,12 +233,13 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( b_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -258,6 +259,7 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) @@ -279,7 +281,7 @@ def operator( ) T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( c_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -299,6 +301,7 @@ def operator( dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index ba7513a4f39c..387d0f20c4c2 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -178,7 +178,11 @@ def test_async_software_pipeline( ref = reference(a_np, b_np) with tvm.transform.PassContext( - config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + config={ + "tir.use_async_copy": 1, + "tir.dma_bypass_cache": 1, + "tir.merge_async_commit_queue_scope": False, + } ): # tvm.lower(schedule.mod["main"]).show() func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 980ac0cf4c2a..afe33e4ab4ed 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -25,7 +25,7 @@ from .infrastructure import get_hexagon_target -MB = 1024**2 +MB = 1024 ** 2 KB = 1024 TEST_OUTPUT_TEMPLATE = ( "Test bandwidth 
with buffer size {}MB... \n" @@ -61,7 +61,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") T.evaluate( T.tvm_call_packed( - "device_api.hexagon.mem_copy_DLTensor", + "device_api.hexagon.dma_copy_dltensor", T.tvm_stack_make_array( a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), @@ -81,6 +81,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: dtype="handle", ), T.cast(size, dtype="int"), + True, # bypass cache dtype="int32", ) ) @@ -113,7 +114,7 @@ def evaluate(hexagon_session, sch, size): ) runtime = timer(a_hexagon, a_vtcm_hexagon) - gbps = round((size / 2**30) / runtime.mean, 4) + gbps = round((size / 2 ** 30) / runtime.mean, 4) tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a) return gbps From f49ea460a0e6f8ab010b809d12faab9a041c7843 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 14 Nov 2022 14:49:15 -0800 Subject: [PATCH 2/4] fix python lint errors --- tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index afe33e4ab4ed..0b6b52335cb5 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -25,7 +25,7 @@ from .infrastructure import get_hexagon_target -MB = 1024 ** 2 +MB = 1024**2 KB = 1024 TEST_OUTPUT_TEMPLATE = ( "Test bandwidth with buffer size {}MB... \n" @@ -114,7 +114,7 @@ def evaluate(hexagon_session, sch, size): ) runtime = timer(a_hexagon, a_vtcm_hexagon) - gbps = round((size / 2 ** 30) / runtime.mean, 4) + gbps = round((size / 2**30) / runtime.mean, 4) tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a) return gbps From 18ef71fc728e1662fb2fd61ab2262348673baeb1 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 15 Nov 2022 10:22:40 -0800 Subject: [PATCH 3/4] restore vtcm tests; add TODO for ION buffer; check IsVtcm pointers --- src/runtime/hexagon/hexagon_buffer.cc | 4 ++ src/runtime/hexagon/hexagon_vtcm_pool.h | 2 + .../python/contrib/test_hexagon/test_vtcm.py | 65 +++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/test_vtcm.py diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 4a966f156780..b8c7bd2cb96e 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -236,6 +236,10 @@ void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& // Finally, do the memory copies. 
for (const auto& copy : macro_copies) { + // clean Hexagon cache before / after memcpy to ensure clean cache state to enable usage of DMA + // bypass mode for increased DMA bandwidth + // TODO(HWE): Switch to ION Buffer to avoid need for memcpy and potentially lighten or alleviate + // the burden of cache invalidation in this code qurt_mem_cache_clean(reinterpret_cast(copy.dest), copy.num_bytes, QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE); qurt_mem_cache_clean(reinterpret_cast(copy.src), copy.num_bytes, diff --git a/src/runtime/hexagon/hexagon_vtcm_pool.h b/src/runtime/hexagon/hexagon_vtcm_pool.h index 56f83967cd90..2e0918e997c4 100644 --- a/src/runtime/hexagon/hexagon_vtcm_pool.h +++ b/src/runtime/hexagon/hexagon_vtcm_pool.h @@ -72,7 +72,9 @@ class HexagonVtcmPool { bool IsVtcm(void* ptr, unsigned size) { auto char_ptr = static_cast(ptr); + CHECK(char_ptr != nullptr); auto char_vtcm = static_cast(vtcm_data_); + CHECK(vtcm_data_ != nullptr); if (char_ptr >= char_vtcm && (char_ptr + size) <= (char_vtcm + vtcm_size_)) { return true; diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py new file mode 100644 index 000000000000..ddd5f7bb4b88 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm.testing +from tvm import tir +from tvm.contrib.hexagon.session import Session +from tvm.script import tir as T + +from .infrastructure import get_hexagon_target + + +@T.prim_func +def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]): + for i in T.serial( + 0, + 8192, + ): + with T.block("C"): + buffer_c[i] = buffer_a[i] * T.int8(2) + + +def test_vtcm_lowering(): + """Test lowering with vtcm mem scope""" + mod = tvm.IRModule.from_expr(scale_by_two.with_attr("global_symbol", "main")) + sch = tir.Schedule(mod, debug_mask="all") + block_c = sch.get_block("C") + (flat,) = sch.get_loops(block_c) + outer, _, _, _ = sch.split(flat, factors=[8, 4, 2, 128]) + cache_block = sch.cache_read(block_c, 0, storage_scope="global.vtcm") + sch.compute_at(cache_block, outer) + lowered = tvm.lower(sch.mod["main"]) + + def ir_module_has_allocate_nodes(irmod): + nallocs = 0 + + def _visit(stmt): + nonlocal nallocs + if isinstance(stmt, tvm.tir.Allocate): + nallocs += 1 + + tvm.tir.stmt_functor.post_order_visit(irmod["main"].body, _visit) + return nallocs + + assert not ir_module_has_allocate_nodes(lowered), ( + "AllocateNode found in lowered IRModule, " + "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" + ) + + +if __name__ == "__main__": + tvm.testing.main() From 21063ff499308f58eff88534c54e1b6a1df29633 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 15 Nov 2022 14:26:47 -0800 Subject: [PATCH 4/4] fix py lint errors; reset test-only changes --- tests/python/contrib/test_hexagon/test_vtcm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_vtcm.py b/tests/python/contrib/test_hexagon/test_vtcm.py index ddd5f7bb4b88..11188436a318 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_vtcm.py @@ -14,14 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""VTCM Tests""" import tvm.testing from tvm import tir -from tvm.contrib.hexagon.session import Session from tvm.script import tir as T -from .infrastructure import get_hexagon_target - @T.prim_func def scale_by_two(buffer_a: T.Buffer[(8192,), "int8"], buffer_c: T.Buffer[(8192,), "int8"]):
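
For reference, a minimal sketch of the synchronous copy path after this series, modeled on the `device_api.hexagon.dma_copy_dltensor` body in patch 1/4: retry while the descriptor ring is temporarily full, then drain the synchronous queue. The helper name `sync_dma_copy_sketch` and the include list are illustrative assumptions, not part of the patch; only the `Copy`/`Wait` calls, the queue constant, and the DMA status codes come from the series itself.

    // Sketch only: a synchronous, optionally cache-bypassing DMA copy using the
    // updated HexagonUserDMA::Copy signature introduced in this series.
    #include "hexagon_device_api.h"
    #include "hexagon_user_dma.h"

    namespace tvm {
    namespace runtime {
    namespace hexagon {

    int sync_dma_copy_sketch(void* dst, void* src, uint32_t length, bool bypass_cache) {
      HexagonUserDMA* dma = HexagonDeviceAPI::Global()->UserDMA();
      int ret = DMA_RETRY;
      do {
        // Copy() rejects lengths above DESC_LENGTH_MASK (24 bits) with DMA_FAILURE,
        // so only DMA_RETRY (descriptor ring temporarily full) is retried here.
        ret = dma->Copy(SYNC_DMA_QUEUE, dst, src, length, bypass_cache);
      } while (ret == DMA_RETRY);
      if (ret != DMA_SUCCESS) {
        return ret;
      }
      // Block until no transfers remain in flight on the synchronous queue.
      dma->Wait(SYNC_DMA_QUEUE, 0);
      return DMA_SUCCESS;
    }

    }  // namespace hexagon
    }  // namespace runtime
    }  // namespace tvm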