From 6bbe6cc576360b5046d09fbc2ab27a57255b0134 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 2 Mar 2022 09:53:08 -0800 Subject: [PATCH 01/38] repurpose texture flatten for vtcm; TIR lowering correct --- src/driver/driver_api.cc | 2 +- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/texture_flatten.cc | 93 ++----------------- .../test_hexagon/test_cache_read_write.py | 10 +- 4 files changed, 15 insertions(+), 92 deletions(-) diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 2a0c2f73f2ba..8d54500a63b7 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -240,8 +240,8 @@ Array CreatePassList(bool disable_loop_partition) { // PHASE 1 pass_list.push_back(tir::transform::InjectPrefetch()); - pass_list.push_back(tir::transform::TextureFlatten()); pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers)); + pass_list.push_back(tir::transform::TextureFlatten()); pass_list.push_back(tir::transform::LowerCrossThreadReduction()); pass_list.push_back(tir::transform::LowerInitBlock()); pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation()); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index bcf763ca8a93..53f00b3c25cc 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -416,7 +416,7 @@ class BuiltinLower : public StmtExprMutator { Call(let->var.dtype(), builtin::tvm_call_packed(), {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), IntImm(DataType::Int(32), dtype.code()), + /*cast(DataType::UInt(64), call->args[1]),*/ IntImm(DataType::Int(32), dtype.code()), IntImm(DataType::Int(32), dtype.bits())}); Stmt alloca = LetStmt(let->var, call_packed, body); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 7dc800737944..de17e7773f8a 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -66,8 +66,8 @@ class TextureLoweringBase : public StmtExprMutator { } protected: - std::string GetStorageScope(const Buffer& buffer) { - auto* ptr = buffer->data->type_annotation.as(); + std::string GetStorageScope(const Var& var) { + auto* ptr = var->type_annotation.as(); ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; return ptr->storage_scope; } @@ -87,97 +87,20 @@ class TextureFlattener : public TextureLoweringBase { IRVisitorWithAnalyzer* bound_analyzer) : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} - Stmt VisitStmt_(const BufferRealizeNode* op) final { - if (extern_buf_.count(op->buffer)) { - return this->VisitStmt(op->body); - } - - std::string storage_scope = GetStorageScope(op->buffer); - Var buffer_var(op->buffer->data->name_hint, - PointerType(PrimType(op->buffer->dtype), String(storage_scope))); - let_binding_.insert({op->buffer->data, buffer_var}); - + Stmt VisitStmt_(const AllocateNode* op) final { + Stmt body = this->VisitStmt(op->body); + std::string storage_scope = GetStorageScope(op->buffer_var); Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); + op = stmt.as(); // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { - Stmt body = this->VisitStmt(op->body); - ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; - int vec_length = static_cast(op->bounds.back()->extent.as()->value); - ICHECK(vec_length == 4 || vec_length == 1) - << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; - - struct ShapeFromRange { - const Array& bounds; - PrimExpr operator[](size_t i) const { return bounds[i]->extent; } - }; - size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); - auto texture = - ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); - Array args = {texture.width, texture.height}; - stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); + Array args = {op->extents.back()}; + stmt = LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } return stmt; } - - Stmt VisitStmt_(const BufferStoreNode* op) final { - Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); - std::string storage_scope = GetStorageScope(op->buffer); - // Lower to two dimensional access - if (IsTextureStorage(storage_scope)) { - Array args = GetTextureAccessArgs(op, op->buffer); - args.push_back(op->value); - stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); - } - - return stmt; - } - - PrimExpr VisitExpr_(const BufferLoadNode* op) final { - PrimExpr expr = StmtExprMutator::VisitExpr_(op); - op = expr.as(); - // Lower to two dimensional access - std::string storage_scope = GetStorageScope(op->buffer); - if (IsTextureStorage(storage_scope)) { - Array args = GetTextureAccessArgs(op, op->buffer); - args.push_back(op->indices.back()); - expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); - } - - return expr; - } - - protected: - template - Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { - Array args; - if (let_binding_.count(op->buffer->data)) { - args.push_back(let_binding_[op->buffer->data]); - } else { - args.push_back(buffer->data); - } - Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { - if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { - col_dims.push_back(op->buffer->shape[i]); - col_indices.push_back(op->indices[i]); - } else { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } - } - PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); - PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); - args.push_back(row_offset); - args.push_back(col_offset); - return args; - } - - // Bindings to new texture vars with texture pointer scope - std::unordered_map let_binding_; }; PrimFunc TextureFlatten(PrimFunc func) { diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 38f62a036e5e..66871a52f3a8 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -75,16 +75,16 @@ def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_p z = te.compute(outer_shape, lambda i: x[i] + y[i], name="z") s = te.create_schedule(z.op) - x_global = s.cache_read(x, "global.vtcm", [z]) - y_global = s.cache_read(y, "global.vtcm", [z]) - z_global = s.cache_write(z, "global.vtcm") + x_global = s.cache_read(x, "global.texture", [z]) + y_global = s.cache_read(y, "global.texture", [z]) + z_global = s.cache_write(z, "global.texture") zouter, zinner = s[z_global].split(z_global.op.axis[0], factor=factor) s[x_global].compute_at(s[z_global], zouter) s[y_global].compute_at(s[z_global], zouter) - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") + mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.texture", "global") (cache_read_x,) = s[x_global].op.axis s[x_global].tensorize(cache_read_x, mem_copy_read) @@ -92,7 +92,7 @@ def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_p (cache_read_y,) = s[y_global].op.axis s[y_global].tensorize(cache_read_y, mem_copy_read) - mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm") + mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.texture") (cache_write_z,) = s[z].op.axis s[z].tensorize(cache_write_z, mem_copy_write) From 3cb0121bbfecec1c9a8e1b5825e4896fb355ec52 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 2 Mar 2022 10:01:48 -0800 Subject: [PATCH 02/38] clean up remaining code in texture flatten pass --- src/tir/transforms/texture_flatten.cc | 56 +++++---------------------- 1 file changed, 10 insertions(+), 46 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index de17e7773f8a..10b721091cec 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -38,54 +38,14 @@ namespace tvm { namespace tir { -using runtime::ApplyTexture2DFlattening; -using runtime::DefaultTextureLayoutSeparator; using runtime::IsTextureStorage; -class TextureLoweringBase : public StmtExprMutator { - public: - explicit TextureLoweringBase(const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) - : bound_analyzer_{bound_analyzer} { - for (auto kv : extern_buffer_map) { - extern_buf_.insert(kv.second); - } - } - - inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { - PrimExpr base = make_const(DataType::Int(32), 0); - ICHECK_EQ(shape.size(), index.size()); - if (index.size() > 0) { - PrimExpr offset = index[0]; - for (size_t i = 1; i < index.size(); ++i) { - offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); - } - base = base + offset; - } - return base; - } - - protected: - std::string GetStorageScope(const Var& var) { - auto* ptr = var->type_annotation.as(); - ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; - return ptr->storage_scope; - } - - // Set of all external input and output buffers - std::unordered_set extern_buf_; - // Bound analzer - IRVisitorWithAnalyzer* bound_analyzer_; -}; - // Lower Nd storage access to 2d texture access using lowering convention // specified by the buffers storage scope. -class TextureFlattener : public TextureLoweringBase { +class TextureFlattener : public StmtExprMutator { public: using StmtExprMutator::VisitStmt_; - explicit TextureFlattener(const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} + TextureFlattener() {} Stmt VisitStmt_(const AllocateNode* op) final { Stmt body = this->VisitStmt(op->body); @@ -93,7 +53,7 @@ class TextureFlattener : public TextureLoweringBase { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - // Rewrite any buffer realizations with storage scope to 2d texture allocations + // Rewrite any allocations with storage scope to 1d (TODO Nd) texture allocations if (IsTextureStorage(storage_scope)) { Array args = {op->extents.back()}; stmt = LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); @@ -101,13 +61,17 @@ class TextureFlattener : public TextureLoweringBase { return stmt; } + protected: + std::string GetStorageScope(const Var& var) { + auto* ptr = var->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - IRVisitorWithAnalyzer bound_analyzer; - bound_analyzer(fptr->body); - fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener()(std::move(fptr->body)); return func; } From 5365fabdc057bdf69db71738225037279eb29cd2 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 2 Mar 2022 13:46:21 -0800 Subject: [PATCH 03/38] add Alloc and FreeTexture, but failing to run over rpc --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 43 +++++++++++++++++++ src/tir/transforms/texture_flatten.cc | 3 ++ 2 files changed, 46 insertions(+) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index a4f6803f682c..ed41b91119d6 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -36,6 +36,10 @@ #include "hexagon_buffer.h" #include "hexagon_common.h" +#if defined(__hexagon__) +#include "HAP_compute_res.h" +#endif + namespace tvm { namespace runtime { namespace hexagon { @@ -152,6 +156,7 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo } TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to mem_copy"); void* dst = args[0]; void* src = args[1]; int size = args[2]; @@ -161,7 +166,45 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM *rv = static_cast(0); }); +TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { + int nbytes = args[0]; +// unsigned int context_id_ = 0; +// void* data_ = nullptr; +// #if defined(__hexagon__) +// compute_res_attr_t res_info; +// HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info)); + +// // allocate nbytes of vtcm on a single page +// HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes, +// /*b_single_page = */ 1)); +// context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 10000); + +// if (context_id_) { +// data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info); +// if (!data_) { +// HEXAGON_PRINT(ERROR, "ERROR: Allocated VTCM ptr is null."); +// HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); +// return; +// } +// } else { +// HEXAGON_PRINT(ERROR, "ERROR: Unable to acquire requeisted resource."); +// return; +// } +// #endif + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to AllocTexture"); + auto data_ = malloc(nbytes); + *rv = data_; +}); + +TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to FreeTexture"); + void* data_ = args[2]; + free(data_); + *rv = static_cast(0); +}); + TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { + HEXAGON_PRINT(ALWAYS, "STRAW: Getting Device API"); DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); *rv = static_cast(ptr); }); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 10b721091cec..f3af7599129c 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -55,6 +55,9 @@ class TextureFlattener : public StmtExprMutator { // Rewrite any allocations with storage scope to 1d (TODO Nd) texture allocations if (IsTextureStorage(storage_scope)) { + std::cout << "--------------------------------------------------------\n"; + std::cout << "op->extents.back() = " << op->extents.back() << "\n"; + std::cout << "--------------------------------------------------------\n"; Array args = {op->extents.back()}; stmt = LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } From 8044dc0e0c87fe6d1a553f3ce7c00db204c7a4ef Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 3 Mar 2022 08:01:20 -0800 Subject: [PATCH 04/38] test passing with malloc in the device api --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 33 +++++-------------- src/tir/transforms/lower_tvm_builtin.cc | 28 ++++++---------- src/tir/transforms/texture_flatten.cc | 12 +++---- .../test_hexagon/test_cache_read_write.py | 7 ++++ 4 files changed, 29 insertions(+), 51 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index ed41b91119d6..da3c7728b923 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -158,8 +158,11 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { HEXAGON_PRINT(ALWAYS, "STRAW: Made it to mem_copy"); void* dst = args[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: dst = %p", dst); void* src = args[1]; + HEXAGON_PRINT(ALWAYS, "STRAW: src = %p", src); int size = args[2]; + HEXAGON_PRINT(ALWAYS, "STRAW: size = %d", size); hexagon_user_dma_1d_sync(dst, src, size); @@ -167,38 +170,18 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM }); TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - int nbytes = args[0]; -// unsigned int context_id_ = 0; -// void* data_ = nullptr; -// #if defined(__hexagon__) -// compute_res_attr_t res_info; -// HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info)); - -// // allocate nbytes of vtcm on a single page -// HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes, -// /*b_single_page = */ 1)); -// context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 10000); - -// if (context_id_) { -// data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info); -// if (!data_) { -// HEXAGON_PRINT(ERROR, "ERROR: Allocated VTCM ptr is null."); -// HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); -// return; -// } -// } else { -// HEXAGON_PRINT(ERROR, "ERROR: Unable to acquire requeisted resource."); -// return; -// } -// #endif HEXAGON_PRINT(ALWAYS, "STRAW: Made it to AllocTexture"); + int nbytes = args[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: nbytes = %d", nbytes); auto data_ = malloc(nbytes); + HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); *rv = data_; }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { HEXAGON_PRINT(ALWAYS, "STRAW: Made it to FreeTexture"); - void* data_ = args[2]; + void* data_ = args[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); free(data_); *rv = static_cast(0); }); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 53f00b3c25cc..0900033ba044 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -400,31 +400,23 @@ class BuiltinLower : public StmtExprMutator { } Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { - ICHECK(device_type_.defined()) << "Unknown device type in current IR"; - ICHECK(device_id_.defined()) << "Unknown device id in current IR"; - Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); + PrimExpr size = call->args[0]; + Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); - DataType dtype = - let->var->type_annotation.as()->element_type.as()->dtype; - - std::string fdevapi_prefix = "device_api."; - fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = - Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), - /*cast(DataType::UInt(64), call->args[1]),*/ IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}); + + std::string fdevapi_prefix = + "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); + + Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), size}); Stmt alloca = LetStmt(let->var, call_packed, body); - Call free_op = - Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeTexture"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index f3af7599129c..b7d69015db5e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -40,8 +40,6 @@ namespace tvm { namespace tir { using runtime::IsTextureStorage; -// Lower Nd storage access to 2d texture access using lowering convention -// specified by the buffers storage scope. class TextureFlattener : public StmtExprMutator { public: using StmtExprMutator::VisitStmt_; @@ -53,19 +51,17 @@ class TextureFlattener : public StmtExprMutator { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - // Rewrite any allocations with storage scope to 1d (TODO Nd) texture allocations if (IsTextureStorage(storage_scope)) { - std::cout << "--------------------------------------------------------\n"; - std::cout << "op->extents.back() = " << op->extents.back() << "\n"; - std::cout << "--------------------------------------------------------\n"; Array args = {op->extents.back()}; - stmt = LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); + stmt = LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } return stmt; } + protected: - std::string GetStorageScope(const Var& var) { + std::string GetStorageScope(const Var& var) { auto* ptr = var->type_annotation.as(); ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; return ptr->storage_scope; diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 66871a52f3a8..bd8d1a8f9e80 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -107,6 +107,13 @@ def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_p dso_binary = "test_binary.so" dso_binary_path = temp.relpath(dso_binary) func.save(dso_binary_path) + print("PRINT LLVM IR") + print(func.get_source("ll")) + for impmod in func.imported_modules: + print(impmod.get_source("ll")) + print("DONE") + print(type(func)) + print(type(func).__mro__) if not android_serial_number: pytest.skip("Skip hardware test since ANDROID_SERIAL_NUMBER is not set.") From 82f61be6ed83d97ddb86173c6c9274e3cd63eaeb Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 3 Mar 2022 08:15:18 -0800 Subject: [PATCH 05/38] cleanup --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 17 ++++++----------- src/tir/transforms/lower_tvm_builtin.cc | 8 ++++---- src/tir/transforms/texture_flatten.cc | 2 -- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index da3c7728b923..d4400fd732da 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -36,10 +36,6 @@ #include "hexagon_buffer.h" #include "hexagon_common.h" -#if defined(__hexagon__) -#include "HAP_compute_res.h" -#endif - namespace tvm { namespace runtime { namespace hexagon { @@ -158,11 +154,11 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { HEXAGON_PRINT(ALWAYS, "STRAW: Made it to mem_copy"); void* dst = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: dst = %p", dst); + HEXAGON_PRINT(ALWAYS, "STRAW: dst = %p", dst); void* src = args[1]; - HEXAGON_PRINT(ALWAYS, "STRAW: src = %p", src); + HEXAGON_PRINT(ALWAYS, "STRAW: src = %p", src); int size = args[2]; - HEXAGON_PRINT(ALWAYS, "STRAW: size = %d", size); + HEXAGON_PRINT(ALWAYS, "STRAW: size = %d", size); hexagon_user_dma_1d_sync(dst, src, size); @@ -172,22 +168,21 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { HEXAGON_PRINT(ALWAYS, "STRAW: Made it to AllocTexture"); int nbytes = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: nbytes = %d", nbytes); + HEXAGON_PRINT(ALWAYS, "STRAW: nbytes = %d", nbytes); auto data_ = malloc(nbytes); - HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); + HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); *rv = data_; }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { HEXAGON_PRINT(ALWAYS, "STRAW: Made it to FreeTexture"); void* data_ = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); + HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); free(data_); *rv = static_cast(0); }); TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { - HEXAGON_PRINT(ALWAYS, "STRAW: Getting Device API"); DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); *rv = static_cast(ptr); }); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 0900033ba044..55d4945ada74 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -400,16 +400,16 @@ class BuiltinLower : public StmtExprMutator { } Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { - PrimExpr size = call->args[0]; - Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); - std::string fdevapi_prefix = - "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); + std::string fdevapi_prefix = "device_api."; + fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); + // TODO: cast? + PrimExpr size = call->args[0]; Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), {StringImm(fdevapi_prefix + ".AllocTexture"), size}); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index b7d69015db5e..2ff5abd8243e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -30,8 +30,6 @@ #include #include -#include - #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/texture.h" #include "../../runtime/thread_storage_scope.h" From c1843edac013b1cd7a6ef222af1893f0bb392d4b Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 4 Mar 2022 14:48:53 -0800 Subject: [PATCH 06/38] fails in very reliable way with memory corruption --- src/runtime/hexagon/hexagon/hexagon_buffer.cc | 15 ++++- .../hexagon/hexagon/hexagon_device_api_v2.cc | 67 ++++++++++++++++--- src/tir/transforms/lower_tvm_builtin.cc | 8 ++- .../test_hexagon/test_cache_read_write.py | 2 +- 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc index e4654a349dca..91e129a07416 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc @@ -165,12 +165,25 @@ HexagonBuffer::HexagonBuffer(void* data, size_t nbytes, Optional scope) allocations_.push_back(data); } -HexagonBuffer::~HexagonBuffer() { managed_allocations_.clear(); } +HexagonBuffer::~HexagonBuffer() { + HEXAGON_PRINT(ALWAYS, "STRAW: ---------------------------------------"); + HEXAGON_PRINT(ALWAYS, "STRAW: DELETING HexagonBuffer at %p", this); + managed_allocations_.clear(); + HEXAGON_PRINT(ALWAYS, "STRAW: ---------------------------------------"); +} void** HexagonBuffer::GetPointer() { + HEXAGON_PRINT(ALWAYS, "STRAW: In HexagonBuffer::GetPointer"); + HEXAGON_PRINT(ALWAYS, "STRAW: checking size"); if (!allocations_.size()) { + HEXAGON_PRINT(ALWAYS, "STRAW: returning nullptr"); return nullptr; } + HEXAGON_PRINT(ALWAYS, "STRAW: checking data"); + void** x = allocations_.data(); + HEXAGON_PRINT(ALWAYS, "STRAW: addressof %p", addressof(allocations_)); + HEXAGON_PRINT(ALWAYS, "STRAW: returning %p", x); + HEXAGON_PRINT(ALWAYS, "STRAW: vtcm pointer %p", x[0]); return allocations_.data(); } diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index d4400fd732da..31236bdda46b 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -125,20 +125,25 @@ void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamH HexagonBuffer* hex_from_buf = static_cast(from->data); HexagonBuffer* hex_to_buf = static_cast(to->data); + HEXAGON_PRINT(ALWAYS, "HERE1"); if (TVMDeviceExtType(from->device.device_type) == kDLHexagon && TVMDeviceExtType(to->device.device_type) == kDLHexagon) { + HEXAGON_PRINT(ALWAYS, " In HEREa"); CHECK(hex_from_buf != nullptr); CHECK(hex_to_buf != nullptr); hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from)); } else if (from->device.device_type == kDLCPU && TVMDeviceExtType(to->device.device_type) == kDLHexagon) { + HEXAGON_PRINT(ALWAYS, " In HEREb"); CHECK(hex_to_buf != nullptr); hex_to_buf->CopyFrom(from->data, GetDataSize(*from)); } else if (TVMDeviceExtType(from->device.device_type) == kDLHexagon && to->device.device_type == kDLCPU) { + HEXAGON_PRINT(ALWAYS, " In HEREc"); CHECK(hex_from_buf != nullptr); hex_from_buf->CopyTo(to->data, GetDataSize(*to)); } else { + HEXAGON_PRINT(ALWAYS, " In HEREd"); CHECK(false) << "Expect copy between DLTensor devices of types kDLHexagon and kDLCPU (external) only."; } @@ -152,39 +157,85 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo } TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { + void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to mem_copy"); void* dst = args[0]; HEXAGON_PRINT(ALWAYS, "STRAW: dst = %p", dst); - void* src = args[1]; + std::string dst_scope = args[1]; + HEXAGON_PRINT(ALWAYS, "STRAW: dst_scope = %s", dst_scope.c_str()); + void* src = args[2]; HEXAGON_PRINT(ALWAYS, "STRAW: src = %p", src); - int size = args[2]; + std::string src_scope = args[3]; + HEXAGON_PRINT(ALWAYS, "STRAW: src_scope = %s", src_scope.c_str()); + int size = args[4]; HEXAGON_PRINT(ALWAYS, "STRAW: size = %d", size); + if(dst_scope == "global.texture") { + HEXAGON_PRINT(ALWAYS, "STRAW: dst is vtcm"); + auto* hexbuf = static_cast(dst); + HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); + dst = hexbuf->GetPointer()[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: vtcm dst = %p", dst); + } + + if(src_scope == "global.texture") { + HEXAGON_PRINT(ALWAYS, "STRAW: src is vtcm"); + auto* hexbuf = static_cast(src); + HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); + src = hexbuf->GetPointer()[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", src); + } + hexagon_user_dma_1d_sync(dst, src, size); *rv = static_cast(0); + //void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { + void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to AllocTexture"); int nbytes = args[0]; HEXAGON_PRINT(ALWAYS, "STRAW: nbytes = %d", nbytes); - auto data_ = malloc(nbytes); - HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); - *rv = data_; + auto *hexbuf = new HexagonBuffer(nbytes, kHexagonAllocAlignment, String("global.vtcm")); + HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); + auto *ptr = hexbuf->GetPointer()[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", ptr); + + *rv = hexbuf; + //void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { + void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); + HEXAGON_PRINT(ALWAYS, "STRAW: Made it to FreeTexture"); - void* data_ = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: data_ = %p", data_); - free(data_); + void* ptr = args[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: ptr = %p", ptr); + auto *hexbuf = static_cast(ptr); + HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); + auto *x = hexbuf->GetPointer()[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", x); + delete hexbuf; *rv = static_cast(0); + + //void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { + void **p = (void**)0x0024FA10; + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); *rv = static_cast(ptr); + HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); } // namespace hexagon diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 55d4945ada74..3ddc5431973b 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -218,14 +218,16 @@ class BuiltinLower : public StmtExprMutator { PrimExpr MakeMemCopy(const CallNode* op) { PrimExpr dst = op->args[0]; - PrimExpr src = op->args[1]; - PrimExpr size = op->args[2]; + PrimExpr dst_scope = op->args[1]; + PrimExpr src = op->args[2]; + PrimExpr src_scope = op->args[3]; + PrimExpr size = op->args[4]; std::string fdevapi_prefix = "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); + {StringImm(fdevapi_prefix + ".mem_copy"), dst, dst_scope, src, src_scope, size}); return VisitExpr(call_packed); } diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index bd8d1a8f9e80..123c3e26c2ab 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -54,7 +54,7 @@ def intrin_func(ins, outs): _dst = outs[0] ib.emit( tvm.tir.call_intrin( - "handle", "tir.mem_copy", _dst.access_ptr("w"), _src.access_ptr("r"), size + "handle", "tir.mem_copy", _dst.access_ptr("w"), dst_scope, _src.access_ptr("r"), src_scope, size ) ) return ib.get() From 4cef7691d8df7fd56a8d34f0e05c2dd85d637666 Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 4 Mar 2022 15:17:24 -0800 Subject: [PATCH 07/38] working with non-HexagonBuffer vtcm alloc --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 97 ++++++++----------- 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 31236bdda46b..06cf2245281f 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -36,6 +36,10 @@ #include "hexagon_buffer.h" #include "hexagon_common.h" +#if defined(__hexagon__) +#include "HAP_compute_res.h" +#endif + namespace tvm { namespace runtime { namespace hexagon { @@ -157,85 +161,60 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo } TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { - void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); - - HEXAGON_PRINT(ALWAYS, "STRAW: Made it to mem_copy"); void* dst = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: dst = %p", dst); - std::string dst_scope = args[1]; - HEXAGON_PRINT(ALWAYS, "STRAW: dst_scope = %s", dst_scope.c_str()); + // std::string dst_scope = args[1]; void* src = args[2]; - HEXAGON_PRINT(ALWAYS, "STRAW: src = %p", src); - std::string src_scope = args[3]; - HEXAGON_PRINT(ALWAYS, "STRAW: src_scope = %s", src_scope.c_str()); + // std::string src_scope = args[3]; int size = args[4]; - HEXAGON_PRINT(ALWAYS, "STRAW: size = %d", size); - - if(dst_scope == "global.texture") { - HEXAGON_PRINT(ALWAYS, "STRAW: dst is vtcm"); - auto* hexbuf = static_cast(dst); - HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); - dst = hexbuf->GetPointer()[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: vtcm dst = %p", dst); - } - - if(src_scope == "global.texture") { - HEXAGON_PRINT(ALWAYS, "STRAW: src is vtcm"); - auto* hexbuf = static_cast(src); - HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); - src = hexbuf->GetPointer()[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", src); - } hexagon_user_dma_1d_sync(dst, src, size); *rv = static_cast(0); - //void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); -TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); +std::map vtcmallocs; - HEXAGON_PRINT(ALWAYS, "STRAW: Made it to AllocTexture"); +TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { int nbytes = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: nbytes = %d", nbytes); - auto *hexbuf = new HexagonBuffer(nbytes, kHexagonAllocAlignment, String("global.vtcm")); - HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); - auto *ptr = hexbuf->GetPointer()[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", ptr); - - *rv = hexbuf; - //void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); + void *data_ = nullptr; + unsigned int context_id_ = 0; +#if defined(__hexagon__) + compute_res_attr_t res_info; + HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info)); + + // allocate nbytes of vtcm on a single page + HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes, + /*b_single_page = */ 1)); + context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 10000); + + if (context_id_) { + data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (!data_) { + HEXAGON_PRINT(ERROR, "ERROR: Allocated VTCM ptr is null."); + HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); + return; + } + } else { + HEXAGON_PRINT(ERROR, "ERROR: Unable to acquire requeisted resource."); + return; + } +#endif + vtcmallocs[data_] = context_id_; + *rv = data_; }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); - - HEXAGON_PRINT(ALWAYS, "STRAW: Made it to FreeTexture"); - void* ptr = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: ptr = %p", ptr); - auto *hexbuf = static_cast(ptr); - HEXAGON_PRINT(ALWAYS, "STRAW: hexbuf = %p", hexbuf); - auto *x = hexbuf->GetPointer()[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: vtcm src = %p", x); - delete hexbuf; + void* data_ = args[0]; + unsigned int context_id_ = vtcmallocs[data_]; +#if defined(__hexagon__) + HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); +#endif *rv = static_cast(0); - - //void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { - void **p = (void**)0x0024FA10; - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); *rv = static_cast(ptr); - HEXAGON_PRINT(ALWAYS, "STRAW: CORRUPTED MEMORY = %p", *p); }); } // namespace hexagon From f13cd4c9e570442053d200192a65d08347c43923 Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 4 Mar 2022 15:38:23 -0800 Subject: [PATCH 08/38] cleanup --- src/runtime/hexagon/hexagon/hexagon_buffer.cc | 15 +-------------- .../hexagon/hexagon/hexagon_device_api_v2.cc | 5 ----- .../contrib/test_hexagon/test_cache_read_write.py | 2 -- 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc index 91e129a07416..e4654a349dca 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc @@ -165,25 +165,12 @@ HexagonBuffer::HexagonBuffer(void* data, size_t nbytes, Optional scope) allocations_.push_back(data); } -HexagonBuffer::~HexagonBuffer() { - HEXAGON_PRINT(ALWAYS, "STRAW: ---------------------------------------"); - HEXAGON_PRINT(ALWAYS, "STRAW: DELETING HexagonBuffer at %p", this); - managed_allocations_.clear(); - HEXAGON_PRINT(ALWAYS, "STRAW: ---------------------------------------"); -} +HexagonBuffer::~HexagonBuffer() { managed_allocations_.clear(); } void** HexagonBuffer::GetPointer() { - HEXAGON_PRINT(ALWAYS, "STRAW: In HexagonBuffer::GetPointer"); - HEXAGON_PRINT(ALWAYS, "STRAW: checking size"); if (!allocations_.size()) { - HEXAGON_PRINT(ALWAYS, "STRAW: returning nullptr"); return nullptr; } - HEXAGON_PRINT(ALWAYS, "STRAW: checking data"); - void** x = allocations_.data(); - HEXAGON_PRINT(ALWAYS, "STRAW: addressof %p", addressof(allocations_)); - HEXAGON_PRINT(ALWAYS, "STRAW: returning %p", x); - HEXAGON_PRINT(ALWAYS, "STRAW: vtcm pointer %p", x[0]); return allocations_.data(); } diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 06cf2245281f..2ddc9b2c7ac2 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -129,25 +129,20 @@ void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamH HexagonBuffer* hex_from_buf = static_cast(from->data); HexagonBuffer* hex_to_buf = static_cast(to->data); - HEXAGON_PRINT(ALWAYS, "HERE1"); if (TVMDeviceExtType(from->device.device_type) == kDLHexagon && TVMDeviceExtType(to->device.device_type) == kDLHexagon) { - HEXAGON_PRINT(ALWAYS, " In HEREa"); CHECK(hex_from_buf != nullptr); CHECK(hex_to_buf != nullptr); hex_to_buf->CopyFrom(*hex_from_buf, GetDataSize(*from)); } else if (from->device.device_type == kDLCPU && TVMDeviceExtType(to->device.device_type) == kDLHexagon) { - HEXAGON_PRINT(ALWAYS, " In HEREb"); CHECK(hex_to_buf != nullptr); hex_to_buf->CopyFrom(from->data, GetDataSize(*from)); } else if (TVMDeviceExtType(from->device.device_type) == kDLHexagon && to->device.device_type == kDLCPU) { - HEXAGON_PRINT(ALWAYS, " In HEREc"); CHECK(hex_from_buf != nullptr); hex_from_buf->CopyTo(to->data, GetDataSize(*to)); } else { - HEXAGON_PRINT(ALWAYS, " In HEREd"); CHECK(false) << "Expect copy between DLTensor devices of types kDLHexagon and kDLCPU (external) only."; } diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 123c3e26c2ab..ded25208f67c 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -112,8 +112,6 @@ def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_p for impmod in func.imported_modules: print(impmod.get_source("ll")) print("DONE") - print(type(func)) - print(type(func).__mro__) if not android_serial_number: pytest.skip("Skip hardware test since ANDROID_SERIAL_NUMBER is not set.") From e9ef946bec0feba59e8271f8171d4c21e322d0aa Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 7 Mar 2022 12:21:00 -0800 Subject: [PATCH 09/38] do not pass scope through mem_copy api --- src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc | 6 ++---- src/tir/transforms/lower_tvm_builtin.cc | 8 +++----- .../python/contrib/test_hexagon/test_cache_read_write.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 2ddc9b2c7ac2..0786c2e8867a 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -157,10 +157,8 @@ void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, vo TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { void* dst = args[0]; - // std::string dst_scope = args[1]; - void* src = args[2]; - // std::string src_scope = args[3]; - int size = args[4]; + void* src = args[1]; + int size = args[2]; hexagon_user_dma_1d_sync(dst, src, size); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 3ddc5431973b..55d4945ada74 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -218,16 +218,14 @@ class BuiltinLower : public StmtExprMutator { PrimExpr MakeMemCopy(const CallNode* op) { PrimExpr dst = op->args[0]; - PrimExpr dst_scope = op->args[1]; - PrimExpr src = op->args[2]; - PrimExpr src_scope = op->args[3]; - PrimExpr size = op->args[4]; + PrimExpr src = op->args[1]; + PrimExpr size = op->args[2]; std::string fdevapi_prefix = "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".mem_copy"), dst, dst_scope, src, src_scope, size}); + {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); return VisitExpr(call_packed); } diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index ded25208f67c..f8dcc84497f4 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -54,7 +54,7 @@ def intrin_func(ins, outs): _dst = outs[0] ib.emit( tvm.tir.call_intrin( - "handle", "tir.mem_copy", _dst.access_ptr("w"), dst_scope, _src.access_ptr("r"), src_scope, size + "handle", "tir.mem_copy", _dst.access_ptr("w"), _src.access_ptr("r"), size ) ) return ib.get() From daac1889c4423f1490e3e0ecc49f1a947e13b908 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 7 Mar 2022 14:25:40 -0600 Subject: [PATCH 10/38] [Hexagon] Resolve breakage in test_hexagon/test_cache_read_write Breakage was caused by https://github.com/apache/tvm/pull/9727, which didn't account for the new `builtin::mem_copy()` when computing the stack size in `StackSizeChecker`. --- src/tir/transforms/lower_tvm_builtin.cc | 31 ++++++++++++++----- .../test_hexagon/test_cache_read_write.py | 3 +- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 7f0631d00e57..0a8ebadcd194 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -34,6 +34,17 @@ namespace tvm { namespace tir { +namespace { +Call MakeMemCopyHelper(const CallNode* op, std::string packed_func_name) { + PrimExpr dst = op->args[0]; + PrimExpr src = op->args[1]; + PrimExpr size = op->args[2]; + + return Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(packed_func_name), dst, src, size}); +} +} // namespace + class StackSizeChecker : public StmtExprVisitor { public: struct StackSizes { @@ -73,10 +84,19 @@ class StackSizeChecker : public StmtExprVisitor { return MakeShape(op); } else if (op->op.same_as(builtin::tvm_stack_make_array())) { return MakeArray(op); + } else if (op->op.same_as(builtin::mem_copy())) { + return MakeMemCopy(op); } else { return StmtExprVisitor::VisitExpr_(op); } } + + void MakeMemCopy(const CallNode* op) { + Call call_packed = MakeMemCopyHelper(op, "nonexistent_function"); + + return VisitExpr(call_packed); + } + // call shape void MakeShape(const CallNode* op) { // if args.size() == 0, it is still valid and represents a scalar @@ -346,15 +366,12 @@ class BuiltinLower : public StmtExprMutator { } PrimExpr MakeMemCopy(const CallNode* op) { - PrimExpr dst = op->args[0]; - PrimExpr src = op->args[1]; - PrimExpr size = op->args[2]; + std::stringstream packed_func_name; + packed_func_name << "device_api." << runtime::DeviceName(device_type_.as()->value) + << ".mem_copy"; - std::string fdevapi_prefix = - "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); + Call call_packed = MakeMemCopyHelper(op, packed_func_name.str()); - Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); return VisitExpr(call_packed); } diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 8216d07adece..16b206371199 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -63,7 +63,7 @@ def intrin_func(ins, outs): @requires_hexagon_toolchain -def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_port): +def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket): size = 128 outer_shape = (size,) factor = 16 @@ -115,6 +115,7 @@ def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_p "rpc_tracker_host": tvm_tracker_host, "rpc_tracker_port": tvm_tracker_port, "rpc_server_port": 7070, + "adb_server_socket": adb_server_socket, } launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) launcher.upload(dso_binary_path, dso_binary) From 5ca8970dc7437d380838344e492178e5d25b5c06 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 7 Mar 2022 12:27:06 -0800 Subject: [PATCH 11/38] use HexagonBuffer in Alloc and Free packed funcs --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 38 ++++--------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 0786c2e8867a..55cdce8fabb5 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -165,43 +165,19 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM *rv = static_cast(0); }); -std::map vtcmallocs; +std::map vtcmallocs; TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { int nbytes = args[0]; - void *data_ = nullptr; - unsigned int context_id_ = 0; -#if defined(__hexagon__) - compute_res_attr_t res_info; - HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info)); - - // allocate nbytes of vtcm on a single page - HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes, - /*b_single_page = */ 1)); - context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 10000); - - if (context_id_) { - data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info); - if (!data_) { - HEXAGON_PRINT(ERROR, "ERROR: Allocated VTCM ptr is null."); - HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); - return; - } - } else { - HEXAGON_PRINT(ERROR, "ERROR: Unable to acquire requeisted resource."); - return; - } -#endif - vtcmallocs[data_] = context_id_; - *rv = data_; + HexagonBuffer *hexbuf = new HexagonBuffer(nbytes, kHexagonAllocAlignment, String("global.vtcm")); + void* ptr = hexbuf->GetPointer()[0]; + vtcmallocs[ptr] = hexbuf; + *rv = ptr; }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - void* data_ = args[0]; - unsigned int context_id_ = vtcmallocs[data_]; -#if defined(__hexagon__) - HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); -#endif + void* ptr = args[0]; + delete vtcmallocs[ptr]; *rv = static_cast(0); }); From 8cea1e1d012533509e829f6e0720bbc53a6b473d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 7 Mar 2022 14:57:29 -0600 Subject: [PATCH 12/38] Added comment indicating need for StackSizeChecker::MakeMemCopy. --- src/tir/transforms/lower_tvm_builtin.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 0a8ebadcd194..4bc41c263dcc 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -85,6 +85,8 @@ class StackSizeChecker : public StmtExprVisitor { } else if (op->op.same_as(builtin::tvm_stack_make_array())) { return MakeArray(op); } else if (op->op.same_as(builtin::mem_copy())) { + // The 3 arguments to mem_copy require 4 arguments to + // tvm_call_packed, so we cannot re-use MakeCallPacked here. return MakeMemCopy(op); } else { return StmtExprVisitor::VisitExpr_(op); From a96c062cc4426c06212131499471b8e1710bdb7e Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 7 Mar 2022 15:27:59 -0800 Subject: [PATCH 13/38] add AllocVtcmWorkspace and FreeVtcmWorkspace --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 35 +++++++++++++++++-- .../hexagon/hexagon/hexagon_device_api_v2.h | 6 ++++ src/tir/transforms/lower_tvm_builtin.cc | 7 ++-- .../test_hexagon/test_cache_read_write.py | 4 ++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index ea4b1c66c450..5378cd936e99 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -123,6 +123,13 @@ void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) { workspace_allocations_.erase(it); } +void* HexagonDeviceAPIv2::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, + DLDataType dtype) { + return AllocDataSpace(dev, ndim, shape, dtype, String("global.vtcm")); +} + +void HexagonDeviceAPIv2::FreeVtcmWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } + void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { CHECK_EQ(from->byte_offset, 0); CHECK_EQ(to->byte_offset, 0); @@ -170,8 +177,22 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM std::map vtcmallocs; TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - int nbytes = args[0]; - HexagonBuffer *hexbuf = new HexagonBuffer(nbytes, kHexagonAllocAlignment, String("global.vtcm")); + int64_t nbytes = args[0]; + int64_t shape[1] = {nbytes}; + + // TODO: pass device as packed func arg + Device dev; + dev.device_type = static_cast(kDLHexagon); + + // TODO: pass dtype as packed func arg + DLDataType dtype; + dtype.bits = 8; + dtype.lanes = 1; + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + HexagonBuffer* hexbuf = + reinterpret_cast(hexapi->AllocVtcmWorkspace(dev, 1, shape, dtype)); + void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; *rv = ptr; @@ -179,7 +200,15 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { void* ptr = args[0]; - delete vtcmallocs[ptr]; + HexagonBuffer* hexbuf = vtcmallocs[ptr]; + // delete hexbuf; + + // TODO: pass device as packed func arg + Device dev; + dev.device_type = static_cast(kDLHexagon); + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + hexapi->FreeVtcmWorkspace(dev, hexbuf); *rv = static_cast(0); }); diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index 3d866307f17c..06b638c03639 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -82,6 +82,12 @@ class HexagonDeviceAPIv2 final : public DeviceAPI { void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final; + // TODO: comments + void* AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype); + + // TODO: comments + void FreeVtcmWorkspace(Device dev, void* ptr); + /*! * \brief Copy data from one storage to another. * \note This API is designed to support special memory with shape dependent layout. diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index e554e2a01b5c..100266f9c53e 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -554,10 +554,9 @@ class BuiltinLower : public StmtExprMutator { std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - // TODO: cast? - PrimExpr size = call->args[0]; - Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), size}); + Call call_packed = Call( + let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::UInt(64), call->args[0])}); Stmt alloca = LetStmt(let->var, call_packed, body); diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 9d20e756a23a..57c0ba1cdbe2 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -63,7 +63,9 @@ def intrin_func(ins, outs): @requires_hexagon_toolchain -def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket): +def test_cache_read_write( + android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket +): size = 128 outer_shape = (size,) factor = 16 From 0ecd0174ad1bb62f603845b1d8cc85a02047b627 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 7 Mar 2022 16:17:29 -0800 Subject: [PATCH 14/38] cleanup --- src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 5378cd936e99..3a40b304cb9a 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -36,10 +36,6 @@ #include "hexagon_buffer.h" #include "hexagon_common.h" -#if defined(__hexagon__) -#include "HAP_compute_res.h" -#endif - namespace tvm { namespace runtime { namespace hexagon { @@ -201,7 +197,6 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { void* ptr = args[0]; HexagonBuffer* hexbuf = vtcmallocs[ptr]; - // delete hexbuf; // TODO: pass device as packed func arg Device dev; From 2c1ee84fe12a2955d5711da15c410ca2c805c4a8 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 8 Mar 2022 09:22:47 -0600 Subject: [PATCH 15/38] Updated unittests to run all contrib/test_hexagon at CI. --- tests/scripts/task_python_hexagon.sh | 2 +- tests/scripts/task_python_hexagon_simulator.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh index f94ec8b45e12..82c1fbe585ea 100755 --- a/tests/scripts/task_python_hexagon.sh +++ b/tests/scripts/task_python_hexagon.sh @@ -24,4 +24,4 @@ source tests/scripts/setup-pytest-env.sh make cython3 -run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon/test_launcher.py +run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon diff --git a/tests/scripts/task_python_hexagon_simulator.sh b/tests/scripts/task_python_hexagon_simulator.sh index cddd52375da1..c8ae847e3eca 100755 --- a/tests/scripts/task_python_hexagon_simulator.sh +++ b/tests/scripts/task_python_hexagon_simulator.sh @@ -35,6 +35,6 @@ export HEXAGON_SHARED_LINK_FLAGS="-Lbuild/hexagon_api_output -lhexagon_rpc_sim" # HEXAGON_TOOLCHAIN is already set export HEXAGON_SDK_ROOT=${HEXAGON_SDK_PATH} export ANDROID_SERIAL_NUMBER=simulator -run_pytest ctypes python-contrib-hexagon-simulator tests/python/contrib/test_hexagon/test_launcher.py +run_pytest ctypes python-contrib-hexagon-simulator tests/python/contrib/test_hexagon kill ${TRACKER_PID} From 351b0afd3f91e504941aeeff5ef0974c324d5faf Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 8 Mar 2022 10:34:09 -0800 Subject: [PATCH 16/38] create separate vtcm alloc lowering pass and transform --- include/tvm/tir/builtin.h | 2 + include/tvm/tir/transform.h | 2 + python/tvm/tir/transform/transform.py | 5 +++ src/driver/driver_api.cc | 3 +- .../hexagon/hexagon/hexagon_device_api_v2.cc | 4 +- src/tir/op/builtin.cc | 3 ++ src/tir/transforms/lower_tvm_builtin.cc | 42 +++++++++++++++++-- .../test_hexagon/test_cache_read_write.py | 12 +++--- 8 files changed, 60 insertions(+), 13 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index f7e1cfbc3e6d..b992353d9f5d 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -638,6 +638,8 @@ TVM_DLL const Op& texture2d_load(); */ TVM_DLL const Op& mem_copy(); +TVM_DLL const Op& vtcm_alloca(); + /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 4330c4f7c64a..eee6a6396a71 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -617,6 +617,8 @@ TVM_DLL Pass ExtractPrimFuncConstants(); */ TVM_DLL Pass RenormalizeSplitPattern(); +TVM_DLL Pass LowerVtcmAlloc(); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 802fdc576c41..84d4a2db20cf 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -123,6 +123,11 @@ def TextureFlatten(): return _ffi_api.TextureFlatten() # type: ignore +# TODO: comments +def LowerVtcmAlloc(): + return _ffi_api.LowerVtcmAlloc() + + def InjectCopyIntrin(pragma_key: str, fintrin): """Inject virtual thread loops. diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 26d45beaae3c..2d9cdd8912ff 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -240,8 +240,9 @@ Array CreatePassList(bool disable_loop_partition) { // PHASE 1 pass_list.push_back(tir::transform::InjectPrefetch()); - pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers)); pass_list.push_back(tir::transform::TextureFlatten()); + pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers)); + pass_list.push_back(tir::transform::LowerVtcmAlloc()); pass_list.push_back(tir::transform::LowerCrossThreadReduction()); pass_list.push_back(tir::transform::LowerInitBlock()); pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation()); diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 3a40b304cb9a..a2152788bfbb 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -172,7 +172,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM std::map vtcmallocs; -TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("device_api.hexagon.AllocVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { int64_t nbytes = args[0]; int64_t shape[1] = {nbytes}; @@ -194,7 +194,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocTexture").set_body([](TVMArgs args, *rv = ptr; }); -TVM_REGISTER_GLOBAL("device_api.hexagon.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("device_api.hexagon.FreeVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { void* ptr = args[0]; HexagonBuffer* hexbuf = vtcmallocs[ptr]; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 0e767ead4e6b..31d494c6c78e 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -263,6 +263,9 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load) TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(vtcm_alloca) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 100266f9c53e..20c25f373132 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -253,6 +253,8 @@ class BuiltinLower : public StmtExprMutator { if (const CallNode* call = op->value.as()) { if (call->op.same_as(builtin::texture2d_alloca())) { return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); + } else if (call->op.same_as(builtin::vtcm_alloca())) { + return StmtExprMutator::VisitStmt(MakeVtcmAlloc(op, call)); } } return StmtExprMutator::VisitStmt_(op); @@ -546,6 +548,38 @@ class BuiltinLower : public StmtExprMutator { } Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; + Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); + + Stmt body = SeqStmt( + {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), + let->body}); + DataType dtype = + let->var->type_annotation.as()->element_type.as()->dtype; + + std::string fdevapi_prefix = "device_api."; + fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); + Call call_packed = + Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}); + + Stmt alloca = LetStmt(let->var, call_packed, body); + + Call free_op = + Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); + + Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); + body = SeqStmt({alloca, free_stmt}); + return body; + } + + Stmt MakeVtcmAlloc(const LetStmtNode* let, const CallNode* call) { Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), @@ -554,14 +588,14 @@ class BuiltinLower : public StmtExprMutator { std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = Call( - let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::UInt(64), call->args[0])}); + Call call_packed = + Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocVtcm"), cast(DataType::UInt(64), call->args[0])}); Stmt alloca = LetStmt(let->var, call_packed, body); Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeTexture"), let->var}); + {StringImm(fdevapi_prefix + ".FreeVtcm"), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 57c0ba1cdbe2..5084f0031367 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -77,16 +77,16 @@ def test_cache_read_write( z = te.compute(outer_shape, lambda i: x[i] + y[i], name="z") s = te.create_schedule(z.op) - x_global = s.cache_read(x, "global.texture", [z]) - y_global = s.cache_read(y, "global.texture", [z]) - z_global = s.cache_write(z, "global.texture") + x_global = s.cache_read(x, "global.vtcm", [z]) + y_global = s.cache_read(y, "global.vtcm", [z]) + z_global = s.cache_write(z, "global.vtcm") zouter, zinner = s[z_global].split(z_global.op.axis[0], factor=factor) s[x_global].compute_at(s[z_global], zouter) s[y_global].compute_at(s[z_global], zouter) - mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.texture", "global") + mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm", "global") (cache_read_x,) = s[x_global].op.axis s[x_global].tensorize(cache_read_x, mem_copy_read) @@ -94,7 +94,7 @@ def test_cache_read_write( (cache_read_y,) = s[y_global].op.axis s[y_global].tensorize(cache_read_y, mem_copy_read) - mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.texture") + mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global", "global.vtcm") (cache_write_z,) = s[z].op.axis s[z].tensorize(cache_write_z, mem_copy_write) @@ -121,7 +121,7 @@ def test_cache_read_write( rpc_info = { "rpc_tracker_host": tvm_tracker_host, "rpc_tracker_port": tvm_tracker_port, - "rpc_server_port": 7070, + "rpc_server_port": 8080, "adb_server_socket": adb_server_socket, } launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) From 0f3778296b18978473b97f209ed808c590f63398 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 8 Mar 2022 10:47:15 -0800 Subject: [PATCH 17/38] reset texture_flatten.cc --- src/tir/transforms/texture_flatten.cc | 144 ++++++++++++++++-- .../test_hexagon/test_cache_read_write.py | 5 - 2 files changed, 130 insertions(+), 19 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 2ff5abd8243e..7dc800737944 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -30,45 +30,161 @@ #include #include +#include + #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/texture.h" #include "../../runtime/thread_storage_scope.h" namespace tvm { namespace tir { +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; using runtime::IsTextureStorage; -class TextureFlattener : public StmtExprMutator { +class TextureLoweringBase : public StmtExprMutator { + public: + explicit TextureLoweringBase(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } + + inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + + protected: + std::string GetStorageScope(const Buffer& buffer) { + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } + + // Set of all external input and output buffers + std::unordered_set extern_buf_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; +}; + +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. +class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; - TextureFlattener() {} + explicit TextureFlattener(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} + + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), String(storage_scope))); + let_binding_.insert({op->buffer->data, buffer_var}); - Stmt VisitStmt_(const AllocateNode* op) final { - Stmt body = this->VisitStmt(op->body); - std::string storage_scope = GetStorageScope(op->buffer_var); Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); + op = stmt.as(); + // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { - Array args = {op->extents.back()}; - stmt = LetStmt(op->buffer_var, - Call(op->buffer_var.dtype(), builtin::texture2d_alloca(), args), body); + Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + int vec_length = static_cast(op->bounds.back()->extent.as()->value); + ICHECK(vec_length == 4 || vec_length == 1) + << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; + + struct ShapeFromRange { + const Array& bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + Array args = {texture.width, texture.height}; + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } return stmt; } + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + std::string storage_scope = GetStorageScope(op->buffer); + // Lower to two dimensional access + if (IsTextureStorage(storage_scope)) { + Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->value); + stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); + } + + return stmt; + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + // Lower to two dimensional access + std::string storage_scope = GetStorageScope(op->buffer); + if (IsTextureStorage(storage_scope)) { + Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->indices.back()); + expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); + } + + return expr; + } + protected: - std::string GetStorageScope(const Var& var) { - auto* ptr = var->type_annotation.as(); - ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; - return ptr->storage_scope; + template + Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { + Array args; + if (let_binding_.count(op->buffer->data)) { + args.push_back(let_binding_[op->buffer->data]); + } else { + args.push_back(buffer->data); + } + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { + if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } + } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + args.push_back(row_offset); + args.push_back(col_offset); + return args; } + + // Bindings to new texture vars with texture pointer scope + std::unordered_map let_binding_; }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - fptr->body = TextureFlattener()(std::move(fptr->body)); + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); return func; } diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 5084f0031367..cf6b5de0c83a 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -109,11 +109,6 @@ def test_cache_read_write( dso_binary = "test_binary.so" dso_binary_path = temp.relpath(dso_binary) func.save(dso_binary_path) - print("PRINT LLVM IR") - print(func.get_source("ll")) - for impmod in func.imported_modules: - print(impmod.get_source("ll")) - print("DONE") if not android_serial_number: pytest.skip("Skip hardware test since ANDROID_SERIAL_NUMBER is not set.") From 6678e1410bbb6acbd0b71b3a379954bd655740e3 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 8 Mar 2022 11:43:08 -0800 Subject: [PATCH 18/38] comments --- include/tvm/tir/builtin.h | 7 +++++-- include/tvm/tir/transform.h | 9 +++++++-- python/tvm/tir/transform/transform.py | 5 ----- src/runtime/hexagon/hexagon/hexagon_device_api_v2.h | 11 +++++++++-- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index b992353d9f5d..f145e5bb2bfe 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -631,6 +631,11 @@ TVM_DLL const Op& texture2d_store(); */ TVM_DLL const Op& texture2d_load(); +/*! + * \brief Create a vtcm allocation + */ +TVM_DLL const Op& vtcm_alloca(); + /*! * \brief Copy 1d memory from source to destination * Same semantics as memcpy(destination, source, size) @@ -638,8 +643,6 @@ TVM_DLL const Op& texture2d_load(); */ TVM_DLL const Op& mem_copy(); -TVM_DLL const Op& vtcm_alloca(); - /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index eee6a6396a71..24c3cfa78f72 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -459,6 +459,13 @@ TVM_DLL Pass FlattenBuffer(); */ TVM_DLL Pass TextureFlatten(); +/* + * \brief Lower VTCM allocations + * + * \return The Pass + */ +TVM_DLL Pass LowerVtcmAlloc(); + /*! * \brief Implements a Common Subexpression Elimination (CSE) for TIR * which introduces let-in bindings for duplicated sub-expressions. @@ -617,8 +624,6 @@ TVM_DLL Pass ExtractPrimFuncConstants(); */ TVM_DLL Pass RenormalizeSplitPattern(); -TVM_DLL Pass LowerVtcmAlloc(); - } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 84d4a2db20cf..802fdc576c41 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -123,11 +123,6 @@ def TextureFlatten(): return _ffi_api.TextureFlatten() # type: ignore -# TODO: comments -def LowerVtcmAlloc(): - return _ffi_api.LowerVtcmAlloc() - - def InjectCopyIntrin(pragma_key: str, fintrin): """Inject virtual thread loops. diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index 06b638c03639..67526b79b3af 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -82,10 +82,17 @@ class HexagonDeviceAPIv2 final : public DeviceAPI { void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final; - // TODO: comments + /*! + * \brief Allocate an Nd VTCM workspace. + * \param dev The device to perform the operation. + * \param ndim The number of dimensions of allocated tensor. + * \param shape The shape of allocated tensor. + * \param dtype The element type. + * \return The allocated HexagonBuffer pointer. + */ void* AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype); - // TODO: comments + //! \brief Free the allocated Nd VTCM workspace. void FreeVtcmWorkspace(Device dev, void* ptr); /*! From 1088c6640fef53ebcabbc2eb182e318efa9eb3c2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 8 Mar 2022 16:44:37 -0600 Subject: [PATCH 19/38] CI bump From 7de3ae0f582d6e15d3b8f6a6b8102f5d5ea95bc8 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 9 Mar 2022 09:18:41 -0600 Subject: [PATCH 20/38] Fix lint formatting error. --- tests/python/contrib/test_hexagon/test_cache_read_write.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index 16b206371199..fb9b352476bd 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -63,7 +63,9 @@ def intrin_func(ins, outs): @requires_hexagon_toolchain -def test_cache_read_write(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket): +def test_cache_read_write( + android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket +): size = 128 outer_shape = (size,) factor = 16 From 794dbbfe8f9f509676d1e4e27a3b297731ee7f18 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 9 Mar 2022 13:03:25 -0600 Subject: [PATCH 21/38] Updated fix to remove StackSizeChecker entirely. --- src/tir/transforms/lower_tvm_builtin.cc | 277 ++++++++---------------- 1 file changed, 96 insertions(+), 181 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 4bc41c263dcc..df46125844a6 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -34,18 +34,9 @@ namespace tvm { namespace tir { -namespace { -Call MakeMemCopyHelper(const CallNode* op, std::string packed_func_name) { - PrimExpr dst = op->args[0]; - PrimExpr src = op->args[1]; - PrimExpr size = op->args[2]; - - return Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(packed_func_name), dst, src, size}); -} -} // namespace - -class StackSizeChecker : public StmtExprVisitor { +// Calculate the statistics of packed function. +// These information are needed during codegen. +class BuiltinLower : public StmtExprMutator { public: struct StackSizes { // If a tvm_stack_make_shape call has no arguments, it is still @@ -57,118 +48,6 @@ class StackSizeChecker : public StmtExprVisitor { uint64_t arg_stack{0}; }; - static StackSizes Check(Stmt stmt) { - StackSizeChecker visitor; - visitor.VisitStmt(stmt); - return visitor.max_stack_; - } - - private: - void VisitStmt_(const ForNode* op) final { - if (op->kind == ForKind::kParallel) { - // Parallel for loops have their own stack and allocations, so - // stop the recursion here. - return; - } else { - this->VisitStmt(op->body); - } - } - void VisitExpr_(const CallNode* op) final { - if (op->op.same_as(builtin::tvm_call_packed())) { - return MakeCallPacked(op, /* use_string_lookup */ true); - } else if (op->op.same_as(builtin::tvm_call_cpacked())) { - return MakeCallPacked(op, /* use_string_lookup */ false); - } else if (op->op.same_as(builtin::tvm_call_trace_packed())) { - return MakeCallTracePacked(op); - } else if (op->op.same_as(builtin::tvm_stack_make_shape())) { - return MakeShape(op); - } else if (op->op.same_as(builtin::tvm_stack_make_array())) { - return MakeArray(op); - } else if (op->op.same_as(builtin::mem_copy())) { - // The 3 arguments to mem_copy require 4 arguments to - // tvm_call_packed, so we cannot re-use MakeCallPacked here. - return MakeMemCopy(op); - } else { - return StmtExprVisitor::VisitExpr_(op); - } - } - - void MakeMemCopy(const CallNode* op) { - Call call_packed = MakeMemCopyHelper(op, "nonexistent_function"); - - return VisitExpr(call_packed); - } - - // call shape - void MakeShape(const CallNode* op) { - // if args.size() == 0, it is still valid and represents a scalar - // shape (). Therefore, -1 is used to represent "no shape - // arguments exist", while 0 represents "shape arguments exist, - // all of which are size 0". - if (current_stack_.shape_stack == -1) { - current_stack_.shape_stack = 0; - } - current_stack_.shape_stack += op->args.size(); - StmtExprVisitor::VisitExpr_(op); - } - // make array - void MakeArray(const CallNode* op) { - current_stack_.array_stack += 1; - StmtExprVisitor::VisitExpr_(op); - } - // call packed. - void MakeCallPacked(const CallNode* op, bool use_string_lookup) { - StackSizes restore_stack = current_stack_; - - size_t arg_count = op->args.size(); - - // cpacked expects a resource_handle parameter - if (!use_string_lookup) { - arg_count--; - } - - current_stack_.arg_stack += arg_count; - // Specially handle the buffer packed intrinsic - StmtExprVisitor::VisitExpr_(op); - // Record the amount of stack space needed, then reset the stack - // position to its previous location. - UpdateMaxStack(); - current_stack_ = restore_stack; - } - - void MakeCallTracePacked(const CallNode* op) { - StackSizes restore_stack = current_stack_; - - size_t args_size = op->args.size(); - ICHECK_GT(args_size, 0); - current_stack_.arg_stack += args_size; - - StmtExprVisitor::VisitExpr_(op); - // Record the amount of stack space needed, then reset the stack - // position to its previous location. - UpdateMaxStack(); - current_stack_ = restore_stack; - - // However, the arguments to this CallNode remain on top of the - // stack, so we can use more than one packed function's arguments - // with the one stack. - current_stack_.arg_stack = restore_stack.arg_stack + args_size - 1; - } - - void UpdateMaxStack() { - max_stack_.arg_stack = std::max(current_stack_.arg_stack, max_stack_.arg_stack); - max_stack_.shape_stack = std::max(current_stack_.shape_stack, max_stack_.shape_stack); - max_stack_.array_stack = std::max(current_stack_.array_stack, max_stack_.array_stack); - } - - StackSizes current_stack_; - StackSizes max_stack_; -}; - -// Calculate the statistics of packed function. -// These information are needed during codegen. -class BuiltinLower : public StmtExprMutator { - public: // Record stack frame for existing scope. struct AllocaScope { Buffer stack_shape; @@ -176,51 +55,78 @@ class BuiltinLower : public StmtExprMutator { Var stack_value = Var("stack_value", DataType::Handle()); Buffer stack_tcode; - int64_t max_shape_stack{-1}; - uint64_t max_array_stack{0}; - uint64_t max_arg_stack{0}; + StackSizes max_sizes; + StackSizes run_sizes; + + void UpdateMax() { + max_sizes.shape_stack = std::max(max_sizes.shape_stack, run_sizes.shape_stack); + max_sizes.array_stack = std::max(max_sizes.array_stack, run_sizes.array_stack); + max_sizes.arg_stack = std::max(max_sizes.arg_stack, run_sizes.arg_stack); + } - int64_t run_shape_stack{-1}; - uint64_t run_array_stack{0}; - uint64_t run_arg_stack{0}; + void AssertMaxIsValid() const { + ICHECK((max_sizes.shape_stack >= run_sizes.shape_stack) || + (max_sizes.array_stack >= run_sizes.array_stack) || + (max_sizes.arg_stack >= run_sizes.arg_stack)); + } }; Stmt Build(Stmt stmt) { return this->VisitBodyAndRealizeAlloca(stmt); } + StackSizes GetMaxStack(Stmt stmt) { + BuiltinLower precheck; + precheck.is_precheck_ = true; + precheck.VisitBodyAndRealizeAlloca(stmt); + + precheck.alloca_scope_.emplace_back(); + auto& scope = precheck.alloca_scope_.back(); + scope.stack_shape = + decl_buffer({IntImm(DataType::Int(64), 0)}, DataType::Int(64), "stack_shape"); + scope.stack_tcode = + decl_buffer({IntImm(DataType::UInt(64), 0)}, DataType::Int(32), "stack_tcode"); + + precheck.VisitStmt(stmt); + + ICHECK_EQ(alloca_scope_.size(), 1); + return precheck.alloca_scope_[0].max_sizes; + } + // Allcoate stack frames, only at parallel-for or root. Stmt VisitBodyAndRealizeAlloca(Stmt stmt) { - // Initial check to identify maximum stack sizes. These are used - // to construct Buffer objects to hold the stack, which are then - // used when mutating. - auto max_sizes = StackSizeChecker::Check(stmt); + // Only perform the precheck up to the point where we would add a + // new scope. + if (is_precheck_) { + return stmt; + } alloca_scope_.emplace_back(); auto& scope = alloca_scope_.back(); - if (max_sizes.shape_stack != -1) { - scope.stack_shape = decl_buffer({IntImm(DataType::Int(64), max_sizes.shape_stack)}, + // Initial check to identify maximum stack sizes. These are used + // to construct Buffer objects to hold the stack, which are then + // used when mutating. + scope.max_sizes = GetMaxStack(stmt); + + if (scope.max_sizes.shape_stack != -1) { + scope.stack_shape = decl_buffer({IntImm(DataType::Int(64), scope.max_sizes.shape_stack)}, DataType::Int(64), "stack_shape"); - stmt = LetStmt(scope.stack_shape->data, StackAlloca("shape", max_sizes.shape_stack), stmt); + stmt = + LetStmt(scope.stack_shape->data, StackAlloca("shape", scope.max_sizes.shape_stack), stmt); } - if (max_sizes.array_stack != 0) { - stmt = LetStmt(scope.stack_array, StackAlloca("array", max_sizes.array_stack), stmt); + if (scope.max_sizes.array_stack != 0) { + stmt = LetStmt(scope.stack_array, StackAlloca("array", scope.max_sizes.array_stack), stmt); } - if (max_sizes.arg_stack != 0) { - scope.stack_tcode = decl_buffer({IntImm(DataType::UInt(64), max_sizes.arg_stack)}, + if (scope.max_sizes.arg_stack != 0) { + scope.stack_tcode = decl_buffer({IntImm(DataType::UInt(64), scope.max_sizes.arg_stack)}, DataType::Int(32), "stack_tcode"); - stmt = LetStmt(scope.stack_value, StackAlloca("arg_value", max_sizes.arg_stack), stmt); + stmt = LetStmt(scope.stack_value, StackAlloca("arg_value", scope.max_sizes.arg_stack), stmt); - stmt = LetStmt(scope.stack_tcode->data, StackAlloca("arg_tcode", max_sizes.arg_stack), stmt); + stmt = LetStmt(scope.stack_tcode->data, StackAlloca("arg_tcode", scope.max_sizes.arg_stack), + stmt); } - // Copy these values from the earlier search, for use in bounds - // checks. - scope.max_shape_stack = max_sizes.shape_stack; - scope.max_array_stack = max_sizes.array_stack; - scope.max_arg_stack = max_sizes.arg_stack; - stmt = this->VisitStmt(stmt); ICHECK(!alloca_scope_.empty()); @@ -235,8 +141,8 @@ class BuiltinLower : public StmtExprMutator { auto stmt = StmtExprMutator::VisitStmt(s); auto& scope = alloca_scope_.back(); - ICHECK_EQ(scope.run_shape_stack, -1); - ICHECK_EQ(scope.run_array_stack, 0); + ICHECK_EQ(scope.run_sizes.shape_stack, -1); + ICHECK_EQ(scope.run_sizes.array_stack, 0); auto prep_seq = std::move(prep_seq_stack_.back()); prep_seq_stack_.pop_back(); @@ -368,12 +274,15 @@ class BuiltinLower : public StmtExprMutator { } PrimExpr MakeMemCopy(const CallNode* op) { - std::stringstream packed_func_name; - packed_func_name << "device_api." << runtime::DeviceName(device_type_.as()->value) - << ".mem_copy"; + PrimExpr dst = op->args[0]; + PrimExpr src = op->args[1]; + PrimExpr size = op->args[2]; - Call call_packed = MakeMemCopyHelper(op, packed_func_name.str()); + std::string fdevapi_prefix = + "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); + Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".mem_copy"), dst, src, size}); return VisitExpr(call_packed); } @@ -383,11 +292,11 @@ class BuiltinLower : public StmtExprMutator { ICHECK(!alloca_scope_.empty()); auto& scope = alloca_scope_.back(); auto& prep_seq = prep_seq_stack_.back(); - if (scope.run_shape_stack == -1) { - scope.run_shape_stack = 0; + if (scope.run_sizes.shape_stack == -1) { + scope.run_sizes.shape_stack = 0; } - int64_t stack_begin = scope.run_shape_stack; - scope.run_shape_stack += op->args.size(); + int64_t stack_begin = scope.run_sizes.shape_stack; + scope.run_sizes.shape_stack += op->args.size(); PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); // no need to perform any store for a scalar shape @@ -403,8 +312,8 @@ class BuiltinLower : public StmtExprMutator { auto& scope = alloca_scope_.back(); auto& prep_seq = prep_seq_stack_.back(); - size_t idx = scope.run_array_stack; - scope.run_array_stack += 1; + size_t idx = scope.run_sizes.array_stack; + scope.run_sizes.array_stack += 1; PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); @@ -445,9 +354,9 @@ class BuiltinLower : public StmtExprMutator { auto& scope = alloca_scope_.back(); auto& prep_seq = prep_seq_stack_.back(); - int64_t restore_shape_stack = scope.run_shape_stack; - size_t restore_array_stack = scope.run_array_stack; - size_t arg_stack_begin = scope.run_arg_stack; + int64_t restore_shape_stack = scope.run_sizes.shape_stack; + size_t restore_array_stack = scope.run_sizes.array_stack; + size_t arg_stack_begin = scope.run_sizes.arg_stack; size_t arg_count = op->args.size(); @@ -456,7 +365,7 @@ class BuiltinLower : public StmtExprMutator { arg_count--; } - scope.run_arg_stack += arg_count; + scope.run_sizes.arg_stack += arg_count; // Specially handle the buffer packed intrinsic PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); @@ -479,12 +388,14 @@ class BuiltinLower : public StmtExprMutator { prep_seq.emplace_back(BufferStore(scope.stack_tcode, ConstInt32(arg_tcode), {stack_index})); } // Verify stack size matches earlier value. - ICHECK_LE(scope.run_arg_stack, scope.max_arg_stack); - ICHECK_LE(scope.run_shape_stack, scope.max_shape_stack); - ICHECK_LE(scope.run_array_stack, scope.max_array_stack); - scope.run_shape_stack = restore_shape_stack; - scope.run_array_stack = restore_array_stack; - scope.run_arg_stack = arg_stack_begin; + if (is_precheck_) { + scope.UpdateMax(); + } else { + scope.AssertMaxIsValid(); + } + scope.run_sizes.shape_stack = restore_shape_stack; + scope.run_sizes.array_stack = restore_array_stack; + scope.run_sizes.arg_stack = arg_stack_begin; Array packed_args = {op->args[0], scope.stack_value, scope.stack_tcode->data, ConstInt32(arg_stack_begin), ConstInt32(arg_stack_begin + op->args.size() - 1)}; @@ -505,10 +416,10 @@ class BuiltinLower : public StmtExprMutator { auto& scope = alloca_scope_.back(); auto& prep_seq = prep_seq_stack_.back(); - int64_t restore_shape_stack = scope.run_shape_stack; - size_t restore_array_stack = scope.run_array_stack; - size_t arg_stack_begin = scope.run_arg_stack; - scope.run_arg_stack += op->args.size(); + int64_t restore_shape_stack = scope.run_sizes.shape_stack; + size_t restore_array_stack = scope.run_sizes.array_stack; + size_t arg_stack_begin = scope.run_sizes.arg_stack; + scope.run_sizes.arg_stack += op->args.size(); size_t args_size = op->args.size(); ICHECK_GT(args_size, 0); PrimExpr expr = StmtExprMutator::VisitExpr_(op); @@ -529,14 +440,16 @@ class BuiltinLower : public StmtExprMutator { prep_seq.emplace_back(BufferStore(scope.stack_tcode, ConstInt32(arg_tcode), {stack_index})); } // Verify stack size matches earlier value. - ICHECK_LE(scope.run_arg_stack, scope.max_arg_stack); - ICHECK_LE(scope.run_shape_stack, scope.max_shape_stack); - ICHECK_LE(scope.run_array_stack, scope.max_array_stack); - scope.run_shape_stack = restore_shape_stack; - scope.run_array_stack = restore_array_stack; + if (is_precheck_) { + scope.UpdateMax(); + } else { + scope.AssertMaxIsValid(); + } + scope.run_sizes.shape_stack = restore_shape_stack; + scope.run_sizes.array_stack = restore_array_stack; // Update the top of the stack, so we can use more than one // packed function's arguments with the one stack. - scope.run_arg_stack = arg_stack_begin + args_size - 1; + scope.run_sizes.arg_stack = arg_stack_begin + args_size - 1; Array packed_args = {op->args[0], scope.stack_value, scope.stack_tcode->data, ConstInt32(arg_stack_begin), ConstInt32(arg_stack_begin + op->args.size() - 1), @@ -594,6 +507,8 @@ class BuiltinLower : public StmtExprMutator { PrimExpr device_type_; PrimExpr device_id_; + bool is_precheck_{false}; + // Record all stack frames. std::vector alloca_scope_; }; From 7b06e7cbf3135977f9f45fe4e4658002f38515ea Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 12:08:00 -0800 Subject: [PATCH 22/38] pass device and type to device api --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 30 ++++++++++++------- src/tir/transforms/lower_tvm_builtin.cc | 16 +++++++--- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index a2152788bfbb..574827d44d93 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -173,21 +173,27 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM std::map vtcmallocs; TVM_REGISTER_GLOBAL("device_api.hexagon.AllocVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { - int64_t nbytes = args[0]; + int device_type = args[0]; + int device_id = args[1]; + int nbytes = args[2]; + // int height = args[3]; + int dtype_code_hint = args[3]; + int dtype_bits_hint = args[4]; + int64_t shape[1] = {nbytes}; - // TODO: pass device as packed func arg Device dev; - dev.device_type = static_cast(kDLHexagon); + dev.device_type = static_cast(device_type); + dev.device_id = device_id; - // TODO: pass dtype as packed func arg - DLDataType dtype; - dtype.bits = 8; - dtype.lanes = 1; + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); HexagonBuffer* hexbuf = - reinterpret_cast(hexapi->AllocVtcmWorkspace(dev, 1, shape, dtype)); + reinterpret_cast(hexapi->AllocVtcmWorkspace(dev, 1, shape, type_hint)); void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; @@ -195,12 +201,14 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocVtcm").set_body([](TVMArgs args, TV }); TVM_REGISTER_GLOBAL("device_api.hexagon.FreeVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { - void* ptr = args[0]; + int device_type = args[0]; + int device_id = args[1]; + void* ptr = args[2]; HexagonBuffer* hexbuf = vtcmallocs[ptr]; - // TODO: pass device as packed func arg Device dev; - dev.device_type = static_cast(kDLHexagon); + dev.device_type = static_cast(device_type); + dev.device_id = device_id; HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); hexapi->FreeVtcmWorkspace(dev, hexbuf); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 19e16101705d..b76b1d0b571d 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -493,22 +493,30 @@ class BuiltinLower : public StmtExprMutator { } Stmt MakeVtcmAlloc(const LetStmtNode* let, const CallNode* call) { + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; + Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); + DataType dtype = + let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocVtcm"), cast(DataType::UInt(64), call->args[0])}); - + {StringImm(fdevapi_prefix + ".AllocVtcm"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), + IntImm(DataType::Int(32), dtype.code()), IntImm(DataType::Int(32), dtype.bits())}); Stmt alloca = LetStmt(let->var, call_packed, body); - Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeVtcm"), let->var}); + Call free_op = + Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeVtcm"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); From bc372da7b72cfda33f6ffb5da26f0c6937105c23 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 9 Mar 2022 14:12:21 -0600 Subject: [PATCH 23/38] Bugfix, verify the precheck's allocations, not own. --- src/tir/transforms/lower_tvm_builtin.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index df46125844a6..5307fcac1236 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -76,7 +76,6 @@ class BuiltinLower : public StmtExprMutator { StackSizes GetMaxStack(Stmt stmt) { BuiltinLower precheck; precheck.is_precheck_ = true; - precheck.VisitBodyAndRealizeAlloca(stmt); precheck.alloca_scope_.emplace_back(); auto& scope = precheck.alloca_scope_.back(); @@ -87,7 +86,7 @@ class BuiltinLower : public StmtExprMutator { precheck.VisitStmt(stmt); - ICHECK_EQ(alloca_scope_.size(), 1); + ICHECK_EQ(precheck.alloca_scope_.size(), 1); return precheck.alloca_scope_[0].max_sizes; } From 4ff6471c88dc701ed78d1f36e7001d3a4b651684 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 9 Mar 2022 15:35:57 -0600 Subject: [PATCH 24/38] Bugfix, pass context information to the precheck. --- src/tir/transforms/lower_tvm_builtin.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 5307fcac1236..e5c45a5a5f04 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -76,6 +76,8 @@ class BuiltinLower : public StmtExprMutator { StackSizes GetMaxStack(Stmt stmt) { BuiltinLower precheck; precheck.is_precheck_ = true; + precheck.device_id_ = this->device_id_; + precheck.device_type_ = this->device_type_; precheck.alloca_scope_.emplace_back(); auto& scope = precheck.alloca_scope_.back(); From 1c23651f5475b40a61747276330a33b13b8bd336 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 13:22:31 -0800 Subject: [PATCH 25/38] pass order and shape to device api --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 95 ++++++++++--------- .../hexagon/hexagon/hexagon_device_api_v2.h | 3 +- src/tir/transforms/lower_tvm_builtin.cc | 25 +++-- 3 files changed, 70 insertions(+), 53 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 574827d44d93..c7ae94808e0b 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -119,9 +119,9 @@ void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) { workspace_allocations_.erase(it); } -void* HexagonDeviceAPIv2::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, - DLDataType dtype) { - return AllocDataSpace(dev, ndim, shape, dtype, String("global.vtcm")); +void* HexagonDeviceAPIv2::AllocWorkspace(Device dev, int ndim, const int64_t* shape, + DLDataType dtype, Optional mem_scope) { + return AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } void HexagonDeviceAPIv2::FreeVtcmWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } @@ -170,50 +170,55 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM *rv = static_cast(0); }); +// TODO: probably need a class here std::map vtcmallocs; -TVM_REGISTER_GLOBAL("device_api.hexagon.AllocVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - int nbytes = args[2]; - // int height = args[3]; - int dtype_code_hint = args[3]; - int dtype_bits_hint = args[4]; - - int64_t shape[1] = {nbytes}; - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; - - HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); - HexagonBuffer* hexbuf = - reinterpret_cast(hexapi->AllocVtcmWorkspace(dev, 1, shape, type_hint)); - - void* ptr = hexbuf->GetPointer()[0]; - vtcmallocs[ptr] = hexbuf; - *rv = ptr; -}); - -TVM_REGISTER_GLOBAL("device_api.hexagon.FreeVtcm").set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - void* ptr = args[2]; - HexagonBuffer* hexbuf = vtcmallocs[ptr]; - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); - hexapi->FreeVtcmWorkspace(dev, hexbuf); - *rv = static_cast(0); -}); +TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") + .set_body([](TVMArgs args, TVMRetValue* rv) { + int device_type = args[0]; + int device_id = args[1]; + int dtype_code_hint = args[2]; + int dtype_bits_hint = args[3]; + std::string scope = args[4]; + const int order = args[5]; + std::vector shape; + for (int i = 0; i < order; ++i) { + shape.push_back(args[6 + i]); + } + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + HexagonBuffer* hexbuf = reinterpret_cast( + hexapi->AllocWorkspace(dev, order, shape.data(), type_hint, String(scope))); + + void* ptr = hexbuf->GetPointer()[0]; + vtcmallocs[ptr] = hexbuf; + *rv = ptr; + }); + +TVM_REGISTER_GLOBAL("device_api.hexagon.FreeNdWithScope") + .set_body([](TVMArgs args, TVMRetValue* rv) { + int device_type = args[0]; + int device_id = args[1]; + void* ptr = args[2]; + HexagonBuffer* hexbuf = vtcmallocs[ptr]; + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + hexapi->FreeVtcmWorkspace(dev, hexbuf); + *rv = static_cast(0); + }); TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index 67526b79b3af..1607fad5bff5 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -90,7 +90,8 @@ class HexagonDeviceAPIv2 final : public DeviceAPI { * \param dtype The element type. * \return The allocated HexagonBuffer pointer. */ - void* AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype); + void* AllocWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope); //! \brief Free the allocated Nd VTCM workspace. void FreeVtcmWorkspace(Device dev, void* ptr); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index b76b1d0b571d..97ec6c1a3607 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -495,8 +495,8 @@ class BuiltinLower : public StmtExprMutator { Stmt MakeVtcmAlloc(const LetStmtNode* let, const CallNode* call) { ICHECK(device_type_.defined()) << "Unknown device type in current IR"; ICHECK(device_id_.defined()) << "Unknown device id in current IR"; - Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); + Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); @@ -506,16 +506,27 @@ class BuiltinLower : public StmtExprMutator { std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = - Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocVtcm"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), - IntImm(DataType::Int(32), dtype.code()), IntImm(DataType::Int(32), dtype.bits())}); + // TODO: send from pass + std::string scope = "global.vtcm"; + + Array args = {StringImm(fdevapi_prefix + ".AllocNdWithScope"), + cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits()), + StringImm(scope), + IntImm(DataType::UInt(64), call->args.size())}; // TODO: size 32 or 64? + + for (size_t i = 0; i < call->args.size(); ++i) { + args.push_back(cast(DataType::UInt(64), call->args[i])); + } + + Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), args); Stmt alloca = LetStmt(let->var, call_packed, body); Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeVtcm"), cast(DataType::Int(32), device_type_), + {StringImm(fdevapi_prefix + ".FreeNdWithScope"), cast(DataType::Int(32), device_type_), cast(DataType::Int(32), device_id_), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); From 7e43cd8335ea6d952e1e61af99d3765650c73d61 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 15:00:27 -0800 Subject: [PATCH 26/38] working --- src/tir/transforms/lower_vtcm_alloc.cc | 86 ++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 src/tir/transforms/lower_vtcm_alloc.cc diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc new file mode 100644 index 000000000000..2e09bc6c7473 --- /dev/null +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "../../arith/ir_visitor_with_analyzer.h" + +namespace tvm { +namespace tir { + +inline bool IsVtcmStorage(std::string scope) { + return scope.find("vtcm") != std::string::npos; +} + +class VtcmAllocator : public StmtExprMutator { + public: + using StmtExprMutator::VisitStmt_; + VtcmAllocator() {} + + Stmt VisitStmt_(const AllocateNode* op) final { + Stmt body = this->VisitStmt(op->body); + std::string storage_scope = GetStorageScope(op->buffer_var); + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + if (IsVtcmStorage(storage_scope)) { + Array args; + //args.push_back(StringImm(storage_scope)); blah + args.push_back(static_cast(op->extents.size())); // TODO: 32 or 64? + for (size_t i = 0; i < op->extents.size(); ++i) { + args.push_back(op->extents[i]); + } + stmt = LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::vtcm_alloca(), args), body); + } + + return stmt; + } + + protected: + std::string GetStorageScope(const Var& var) { + auto* ptr = var->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } +}; + +PrimFunc LowerVtcmAlloc(PrimFunc func) { + auto fptr = func.CopyOnWrite(); + fptr->body = VtcmAllocator()(std::move(fptr->body)); + return func; +} + +namespace transform { + +Pass LowerVtcmAlloc() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return LowerVtcmAlloc(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.LowerVtcmAlloc", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.LowerVtcmAlloc").set_body_typed(LowerVtcmAlloc); + +} // namespace transform + +} // namespace tir +} // namespace tvm From 5132fd62d6663406f7132d660127d4eb968f9db2 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 16:30:13 -0800 Subject: [PATCH 27/38] fix up types and arg passing --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 28 ++++++++++++------- src/tir/transforms/lower_tvm_builtin.cc | 14 ++++------ src/tir/transforms/lower_vtcm_alloc.cc | 3 +- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index c7ae94808e0b..094e9401c548 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -175,14 +175,22 @@ std::map vtcmallocs; TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") .set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - int dtype_code_hint = args[2]; - int dtype_bits_hint = args[3]; + HEXAGON_PRINT(ALWAYS, "STRAW: In device_api.hexagon.AllocNdWithScope"); + int32_t device_type = args[0]; + HEXAGON_PRINT(ALWAYS, "STRAW: device type = %d", device_type); + int32_t device_id = args[1]; + HEXAGON_PRINT(ALWAYS, "STRAW: device id = %d", device_id); + int32_t dtype_code = args[2]; + HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code); + int32_t dtype_bits = args[3]; + HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits); std::string scope = args[4]; - const int order = args[5]; + HEXAGON_PRINT(ALWAYS, "STRAW: scope = %s", scope.c_str()); + int64_t order = args[5]; + HEXAGON_PRINT(ALWAYS, "STRAW: order = %d", order); std::vector shape; for (int i = 0; i < order; ++i) { + HEXAGON_PRINT(ALWAYS, "STRAW: dim = %d", args[6 + i]); shape.push_back(args[6 + i]); } @@ -190,14 +198,14 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") dev.device_type = static_cast(device_type); dev.device_id = device_id; - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; + DLDataType dtype; + dtype.code = static_cast(dtype_code); + dtype.bits = static_cast(dtype_bits); + dtype.lanes = 1; HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); HexagonBuffer* hexbuf = reinterpret_cast( - hexapi->AllocWorkspace(dev, order, shape.data(), type_hint, String(scope))); + hexapi->AllocWorkspace(dev, order, shape.data(), dtype, String(scope))); void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 97ec6c1a3607..51df3b680ded 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -508,17 +508,16 @@ class BuiltinLower : public StmtExprMutator { // TODO: send from pass std::string scope = "global.vtcm"; - Array args = {StringImm(fdevapi_prefix + ".AllocNdWithScope"), - cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), + device_type_, + device_id_, IntImm(DataType::Int(32), dtype.code()), IntImm(DataType::Int(32), dtype.bits()), - StringImm(scope), - IntImm(DataType::UInt(64), call->args.size())}; // TODO: size 32 or 64? + StringImm(scope) + }; for (size_t i = 0; i < call->args.size(); ++i) { - args.push_back(cast(DataType::UInt(64), call->args[i])); + args.push_back(call->args[i]); } Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), args); @@ -526,8 +525,7 @@ class BuiltinLower : public StmtExprMutator { Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeNdWithScope"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + {StringImm(fdevapi_prefix + ".FreeNdWithScope"), device_type_, device_id_, let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index 2e09bc6c7473..90a7c6acdfab 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -43,8 +43,7 @@ class VtcmAllocator : public StmtExprMutator { if (IsVtcmStorage(storage_scope)) { Array args; - //args.push_back(StringImm(storage_scope)); blah - args.push_back(static_cast(op->extents.size())); // TODO: 32 or 64? + args.push_back(IntImm(DataType::Int(64), op->extents.size())); for (size_t i = 0; i < op->extents.size(); ++i) { args.push_back(op->extents[i]); } From b28ff9c659351e0091a68730120ee54eaf787231 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 16:39:51 -0800 Subject: [PATCH 28/38] pass scope to device api --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 19 +++++++++++-------- src/tir/transforms/lower_tvm_builtin.cc | 4 +--- src/tir/transforms/lower_vtcm_alloc.cc | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 094e9401c548..c6e7c75b6e2d 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -175,25 +175,28 @@ std::map vtcmallocs; TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") .set_body([](TVMArgs args, TVMRetValue* rv) { - HEXAGON_PRINT(ALWAYS, "STRAW: In device_api.hexagon.AllocNdWithScope"); int32_t device_type = args[0]; - HEXAGON_PRINT(ALWAYS, "STRAW: device type = %d", device_type); int32_t device_id = args[1]; - HEXAGON_PRINT(ALWAYS, "STRAW: device id = %d", device_id); int32_t dtype_code = args[2]; - HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code); int32_t dtype_bits = args[3]; - HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits); std::string scope = args[4]; - HEXAGON_PRINT(ALWAYS, "STRAW: scope = %s", scope.c_str()); int64_t order = args[5]; - HEXAGON_PRINT(ALWAYS, "STRAW: order = %d", order); std::vector shape; for (int i = 0; i < order; ++i) { - HEXAGON_PRINT(ALWAYS, "STRAW: dim = %d", args[6 + i]); shape.push_back(args[6 + i]); } + HEXAGON_PRINT(ALWAYS, "STRAW: In device_api.hexagon.AllocNdWithScope"); + HEXAGON_PRINT(ALWAYS, "STRAW: device type = %d", device_type); + HEXAGON_PRINT(ALWAYS, "STRAW: device id = %d", device_id); + HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code); + HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits); + HEXAGON_PRINT(ALWAYS, "STRAW: scope = %s", scope.c_str()); + HEXAGON_PRINT(ALWAYS, "STRAW: order = %d", order); + for (int i = 0; i < order; ++i) { + HEXAGON_PRINT(ALWAYS, "STRAW: dim = %d", shape[i]); + } + Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 51df3b680ded..22ae82a41134 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -500,20 +500,18 @@ class BuiltinLower : public StmtExprMutator { Stmt body = SeqStmt( {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); + DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - // TODO: send from pass - std::string scope = "global.vtcm"; Array args = {StringImm(fdevapi_prefix + ".AllocNdWithScope"), device_type_, device_id_, IntImm(DataType::Int(32), dtype.code()), IntImm(DataType::Int(32), dtype.bits()), - StringImm(scope) }; for (size_t i = 0; i < call->args.size(); ++i) { diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index 90a7c6acdfab..77a551148040 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -43,6 +43,7 @@ class VtcmAllocator : public StmtExprMutator { if (IsVtcmStorage(storage_scope)) { Array args; + args.push_back(StringImm(storage_scope)); args.push_back(IntImm(DataType::Int(64), op->extents.size())); for (size_t i = 0; i < op->extents.size(); ++i) { args.push_back(op->extents[i]); From 3d28c5949b125718b1e055340e15a1a271591a29 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 17:01:39 -0800 Subject: [PATCH 29/38] common builtin for texture / vtcm --- include/tvm/tir/builtin.h | 9 ++----- .../hexagon/hexagon/hexagon_device_api_v2.cc | 24 ++++++++++--------- src/runtime/opencl/opencl_device_api.cc | 19 ++++++++------- src/tir/op/builtin.cc | 5 +--- src/tir/transforms/lower_tvm_builtin.cc | 8 +++---- src/tir/transforms/lower_vtcm_alloc.cc | 2 +- src/tir/transforms/texture_flatten.cc | 4 ++-- 7 files changed, 33 insertions(+), 38 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index f145e5bb2bfe..12eabbdee6ab 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -617,9 +617,9 @@ TVM_DLL const Op& vectorcombine(); */ TVM_DLL const Op& atomic_add(); /*! - * \brief Create a texture 2d memory allocation + * \brief Create an Nd memory allocation */ -TVM_DLL const Op& texture2d_alloca(); +TVM_DLL const Op& ndmemalloc(); /*! * \brief Store to texture 2d memory @@ -631,11 +631,6 @@ TVM_DLL const Op& texture2d_store(); */ TVM_DLL const Op& texture2d_load(); -/*! - * \brief Create a vtcm allocation - */ -TVM_DLL const Op& vtcm_alloca(); - /*! * \brief Copy 1d memory from source to destination * Same semantics as memcpy(destination, source, size) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index c6e7c75b6e2d..42cd001fbfd3 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -177,11 +177,12 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") .set_body([](TVMArgs args, TVMRetValue* rv) { int32_t device_type = args[0]; int32_t device_id = args[1]; - int32_t dtype_code = args[2]; - int32_t dtype_bits = args[3]; - std::string scope = args[4]; - int64_t order = args[5]; + int32_t dtype_code_hint = args[2]; + int32_t dtype_bits_hint = args[3]; + std::string scope = args[4]; // TODO: check scope = vtcm + int64_t order = args[5]; // TODO: check no overflow std::vector shape; + // TODO: coallesce to 1d for now? for (int i = 0; i < order; ++i) { shape.push_back(args[6 + i]); } @@ -189,8 +190,8 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") HEXAGON_PRINT(ALWAYS, "STRAW: In device_api.hexagon.AllocNdWithScope"); HEXAGON_PRINT(ALWAYS, "STRAW: device type = %d", device_type); HEXAGON_PRINT(ALWAYS, "STRAW: device id = %d", device_id); - HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code); - HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits); + HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code_hint); + HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits_hint); HEXAGON_PRINT(ALWAYS, "STRAW: scope = %s", scope.c_str()); HEXAGON_PRINT(ALWAYS, "STRAW: order = %d", order); for (int i = 0; i < order; ++i) { @@ -201,20 +202,21 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") dev.device_type = static_cast(device_type); dev.device_id = device_id; - DLDataType dtype; - dtype.code = static_cast(dtype_code); - dtype.bits = static_cast(dtype_bits); - dtype.lanes = 1; + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); HexagonBuffer* hexbuf = reinterpret_cast( - hexapi->AllocWorkspace(dev, order, shape.data(), dtype, String(scope))); + hexapi->AllocWorkspace(dev, order, shape.data(), type_hint, String(scope))); void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; *rv = ptr; }); +// TODO: no scope TVM_REGISTER_GLOBAL("device_api.hexagon.FreeNdWithScope") .set_body([](TVMArgs args, TVMRetValue* rv) { int device_type = args[0]; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 66561dcdf279..9af325d12213 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -438,13 +438,16 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic initialized_ = true; } -TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - int width = args[2]; - int height = args[3]; - int dtype_code_hint = args[4]; - int dtype_bits_hint = args[5]; +TVM_REGISTER_GLOBAL("device_api.opencl.AllocNdWithScope").set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + int32_t dtype_code_hint = args[2]; + int32_t dtype_bits_hint = args[3]; + std::string scope = args[4]; // TODO: check scope = texture + int64_t order = args[5]; // TODO: check order = 2 + int64_t width = args[6]; + int64_t height = args[7]; + Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; @@ -459,7 +462,7 @@ TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs args, type_hint); }); -TVM_REGISTER_GLOBAL("device_api.opencl.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("device_api.opencl.FreeNdWithScope").set_body([](TVMArgs args, TVMRetValue* rv) { int device_type = args[0]; int device_id = args[1]; void* data = args[2]; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 31d494c6c78e..361030668098 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -249,7 +249,7 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(texture2d_alloca) +TIR_DEFINE_BUILTIN_FUNC(ndmemalloc) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(texture2d_store) @@ -263,9 +263,6 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load) TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(vtcm_alloca) - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 22ae82a41134..fa879ced1fdf 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -157,10 +157,8 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt_(const LetStmtNode* op) final { if (const CallNode* call = op->value.as()) { - if (call->op.same_as(builtin::texture2d_alloca())) { - return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); - } else if (call->op.same_as(builtin::vtcm_alloca())) { - return StmtExprMutator::VisitStmt(MakeVtcmAlloc(op, call)); + if (call->op.same_as(builtin::ndmemalloc())) { + return StmtExprMutator::VisitStmt(MakeNdMemAlloc(op, call)); } } return StmtExprMutator::VisitStmt_(op); @@ -492,7 +490,7 @@ class BuiltinLower : public StmtExprMutator { return body; } - Stmt MakeVtcmAlloc(const LetStmtNode* let, const CallNode* call) { + Stmt MakeNdMemAlloc(const LetStmtNode* let, const CallNode* call) { ICHECK(device_type_.defined()) << "Unknown device type in current IR"; ICHECK(device_id_.defined()) << "Unknown device id in current IR"; Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index 77a551148040..06b6fd22a74c 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -49,7 +49,7 @@ class VtcmAllocator : public StmtExprMutator { args.push_back(op->extents[i]); } stmt = LetStmt(op->buffer_var, - Call(op->buffer_var.dtype(), builtin::vtcm_alloca(), args), body); + Call(op->buffer_var.dtype(), builtin::ndmemalloc(), args), body); } return stmt; diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 7dc800737944..06295ccba52f 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -115,8 +115,8 @@ class TextureFlattener : public TextureLoweringBase { size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); - Array args = {texture.width, texture.height}; - stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); + Array args = {StringImm(storage_scope), 2, texture.width, texture.height}; + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::ndmemalloc(), args), body); } return stmt; From 47268c5ad9f5b852d93545eff91f9361f3ab586f Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 17:31:43 -0800 Subject: [PATCH 30/38] add scope to freend api --- include/tvm/tir/builtin.h | 4 +- .../hexagon/hexagon/hexagon_device_api_v2.cc | 17 +++-- .../hexagon/hexagon/hexagon_device_api_v2.h | 4 +- src/runtime/opencl/opencl_device_api.cc | 73 ++++++++++--------- src/tir/op/builtin.cc | 2 +- src/tir/transforms/lower_tvm_builtin.cc | 58 ++++----------- src/tir/transforms/lower_vtcm_alloc.cc | 6 +- src/tir/transforms/texture_flatten.cc | 3 +- 8 files changed, 70 insertions(+), 97 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 12eabbdee6ab..00b7d8a2bff8 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -617,9 +617,9 @@ TVM_DLL const Op& vectorcombine(); */ TVM_DLL const Op& atomic_add(); /*! - * \brief Create an Nd memory allocation + * \brief Create an Nd memory allocation with storage scope */ -TVM_DLL const Op& ndmemalloc(); +TVM_DLL const Op& nd_mem_alloc_with_scope(); /*! * \brief Store to texture 2d memory diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 42cd001fbfd3..b31c1860962f 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -119,8 +119,8 @@ void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) { workspace_allocations_.erase(it); } -void* HexagonDeviceAPIv2::AllocWorkspace(Device dev, int ndim, const int64_t* shape, - DLDataType dtype, Optional mem_scope) { +void* HexagonDeviceAPIv2::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, + DLDataType dtype, Optional mem_scope) { return AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } @@ -179,8 +179,8 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") int32_t device_id = args[1]; int32_t dtype_code_hint = args[2]; int32_t dtype_bits_hint = args[3]; - std::string scope = args[4]; // TODO: check scope = vtcm - int64_t order = args[5]; // TODO: check no overflow + std::string scope = args[4]; // TODO: check scope = vtcm + int64_t order = args[5]; // TODO: check no overflow std::vector shape; // TODO: coallesce to 1d for now? for (int i = 0; i < order; ++i) { @@ -209,7 +209,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); HexagonBuffer* hexbuf = reinterpret_cast( - hexapi->AllocWorkspace(dev, order, shape.data(), type_hint, String(scope))); + hexapi->AllocVtcmWorkspace(dev, order, shape.data(), type_hint, String(scope))); void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; @@ -219,9 +219,10 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") // TODO: no scope TVM_REGISTER_GLOBAL("device_api.hexagon.FreeNdWithScope") .set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - void* ptr = args[2]; + int32_t device_type = args[0]; + int32_t device_id = args[1]; + std::string scope = args[2]; // TODO: check scope = vtcm + void* ptr = args[3]; HexagonBuffer* hexbuf = vtcmallocs[ptr]; Device dev; diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index 1607fad5bff5..5cbdf8907fa9 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -90,8 +90,8 @@ class HexagonDeviceAPIv2 final : public DeviceAPI { * \param dtype The element type. * \return The allocated HexagonBuffer pointer. */ - void* AllocWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional mem_scope); + void* AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope); //! \brief Free the allocated Nd VTCM workspace. void FreeVtcmWorkspace(Device dev, void* ptr); diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 9af325d12213..26b9c08487cb 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -438,41 +438,44 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic initialized_ = true; } -TVM_REGISTER_GLOBAL("device_api.opencl.AllocNdWithScope").set_body([](TVMArgs args, TVMRetValue* rv) { - int32_t device_type = args[0]; - int32_t device_id = args[1]; - int32_t dtype_code_hint = args[2]; - int32_t dtype_bits_hint = args[3]; - std::string scope = args[4]; // TODO: check scope = texture - int64_t order = args[5]; // TODO: check order = 2 - int64_t width = args[6]; - int64_t height = args[7]; - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; - - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - *rv = ptr->AllocTextureWorkspace(dev, static_cast(width), static_cast(height), - type_hint); -}); - -TVM_REGISTER_GLOBAL("device_api.opencl.FreeNdWithScope").set_body([](TVMArgs args, TVMRetValue* rv) { - int device_type = args[0]; - int device_id = args[1]; - void* data = args[2]; - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - ptr->FreeTextureWorkspace(dev, data); - *rv = static_cast(0); -}); +TVM_REGISTER_GLOBAL("device_api.opencl.AllocNdWithScope") + .set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + int32_t dtype_code_hint = args[2]; + int32_t dtype_bits_hint = args[3]; + std::string scope = args[4]; // TODO: check scope = texture + int64_t order = args[5]; // TODO: check order = 2 + int64_t width = args[6]; + int64_t height = args[7]; + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); + *rv = ptr->AllocTextureWorkspace(dev, static_cast(width), static_cast(height), + type_hint); + }); + +TVM_REGISTER_GLOBAL("device_api.opencl.FreeNdWithScope") + .set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + std::string scope = args[2]; // TODO: check scope = texture + void* data = args[3]; + OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + ptr->FreeTextureWorkspace(dev, data); + *rv = static_cast(0); + }); TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = OpenCLWorkspace::Global(); diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 361030668098..8ea46c19a154 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -249,7 +249,7 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(ndmemalloc) +TIR_DEFINE_BUILTIN_FUNC(nd_mem_alloc_with_scope) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(texture2d_store) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index fa879ced1fdf..d70a34d79009 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -157,8 +157,8 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt_(const LetStmtNode* op) final { if (const CallNode* call = op->value.as()) { - if (call->op.same_as(builtin::ndmemalloc())) { - return StmtExprMutator::VisitStmt(MakeNdMemAlloc(op, call)); + if (call->op.same_as(builtin::nd_mem_alloc_with_scope())) { + return StmtExprMutator::VisitStmt(MakeNdMemAllocWithScope(op, call)); } } return StmtExprMutator::VisitStmt_(op); @@ -458,39 +458,7 @@ class BuiltinLower : public StmtExprMutator { return Call(op->dtype, builtin::tvm_call_trace_packed_lowered(), packed_args); } - Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { - ICHECK(device_type_.defined()) << "Unknown device type in current IR"; - ICHECK(device_id_.defined()) << "Unknown device id in current IR"; - Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); - - Stmt body = SeqStmt( - {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), - let->body}); - DataType dtype = - let->var->type_annotation.as()->element_type.as()->dtype; - - std::string fdevapi_prefix = "device_api."; - fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = - Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}); - - Stmt alloca = LetStmt(let->var, call_packed, body); - - Call free_op = - Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeTexture"), cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); - - Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); - body = SeqStmt({alloca, free_stmt}); - return body; - } - - Stmt MakeNdMemAlloc(const LetStmtNode* let, const CallNode* call) { + Stmt MakeNdMemAllocWithScope(const LetStmtNode* let, const CallNode* call) { ICHECK(device_type_.defined()) << "Unknown device type in current IR"; ICHECK(device_id_.defined()) << "Unknown device id in current IR"; Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); @@ -505,12 +473,13 @@ class BuiltinLower : public StmtExprMutator { std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Array args = {StringImm(fdevapi_prefix + ".AllocNdWithScope"), - device_type_, - device_id_, - IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits()), - }; + Array args = { + StringImm(fdevapi_prefix + ".AllocNdWithScope"), + device_type_, + device_id_, + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits()), + }; for (size_t i = 0; i < call->args.size(); ++i) { args.push_back(call->args[i]); @@ -519,9 +488,10 @@ class BuiltinLower : public StmtExprMutator { Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), args); Stmt alloca = LetStmt(let->var, call_packed, body); - Call free_op = - Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeNdWithScope"), device_type_, device_id_, let->var}); + PrimExpr storage_scope = call->args[0]; + Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeNdWithScope"), device_type_, device_id_, + storage_scope, let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index 06b6fd22a74c..d133ce5e9d16 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -26,9 +26,7 @@ namespace tvm { namespace tir { -inline bool IsVtcmStorage(std::string scope) { - return scope.find("vtcm") != std::string::npos; -} +inline bool IsVtcmStorage(std::string scope) { return scope.find("vtcm") != std::string::npos; } class VtcmAllocator : public StmtExprMutator { public: @@ -49,7 +47,7 @@ class VtcmAllocator : public StmtExprMutator { args.push_back(op->extents[i]); } stmt = LetStmt(op->buffer_var, - Call(op->buffer_var.dtype(), builtin::ndmemalloc(), args), body); + Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); } return stmt; diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 06295ccba52f..44d40fb1c60b 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -116,7 +116,8 @@ class TextureFlattener : public TextureLoweringBase { auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); Array args = {StringImm(storage_scope), 2, texture.width, texture.height}; - stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::ndmemalloc(), args), body); + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), + body); } return stmt; From 16452381a04432870c35a87a1d81f0379c13c5d4 Mon Sep 17 00:00:00 2001 From: adstraw Date: Wed, 9 Mar 2022 17:42:14 -0800 Subject: [PATCH 31/38] format and lint --- .../hexagon/hexagon/hexagon_device_api_v2.cc | 115 ++++++++---------- .../hexagon/hexagon/hexagon_device_api_v2.h | 4 + src/runtime/opencl/opencl_device_api.cc | 77 ++++++------ src/tir/transforms/lower_tvm_builtin.cc | 8 +- 4 files changed, 101 insertions(+), 103 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index b31c1860962f..4cc72dbaf4a9 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -170,69 +170,62 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM *rv = static_cast(0); }); -// TODO: probably need a class here std::map vtcmallocs; -TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNdWithScope") - .set_body([](TVMArgs args, TVMRetValue* rv) { - int32_t device_type = args[0]; - int32_t device_id = args[1]; - int32_t dtype_code_hint = args[2]; - int32_t dtype_bits_hint = args[3]; - std::string scope = args[4]; // TODO: check scope = vtcm - int64_t order = args[5]; // TODO: check no overflow - std::vector shape; - // TODO: coallesce to 1d for now? - for (int i = 0; i < order; ++i) { - shape.push_back(args[6 + i]); - } - - HEXAGON_PRINT(ALWAYS, "STRAW: In device_api.hexagon.AllocNdWithScope"); - HEXAGON_PRINT(ALWAYS, "STRAW: device type = %d", device_type); - HEXAGON_PRINT(ALWAYS, "STRAW: device id = %d", device_id); - HEXAGON_PRINT(ALWAYS, "STRAW: dtype code = %d", dtype_code_hint); - HEXAGON_PRINT(ALWAYS, "STRAW: dtype bits = %d", dtype_bits_hint); - HEXAGON_PRINT(ALWAYS, "STRAW: scope = %s", scope.c_str()); - HEXAGON_PRINT(ALWAYS, "STRAW: order = %d", order); - for (int i = 0; i < order; ++i) { - HEXAGON_PRINT(ALWAYS, "STRAW: dim = %d", shape[i]); - } - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; - - HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); - HexagonBuffer* hexbuf = reinterpret_cast( - hexapi->AllocVtcmWorkspace(dev, order, shape.data(), type_hint, String(scope))); - - void* ptr = hexbuf->GetPointer()[0]; - vtcmallocs[ptr] = hexbuf; - *rv = ptr; - }); - -// TODO: no scope -TVM_REGISTER_GLOBAL("device_api.hexagon.FreeNdWithScope") - .set_body([](TVMArgs args, TVMRetValue* rv) { - int32_t device_type = args[0]; - int32_t device_id = args[1]; - std::string scope = args[2]; // TODO: check scope = vtcm - void* ptr = args[3]; - HexagonBuffer* hexbuf = vtcmallocs[ptr]; - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); - hexapi->FreeVtcmWorkspace(dev, hexbuf); - *rv = static_cast(0); - }); +TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNd").set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + int32_t dtype_code_hint = args[2]; + int32_t dtype_bits_hint = args[3]; + std::string scope = args[4]; + CHECK(scope.find("vtcm") != std::string::npos); + int64_t ndim = args[5]; + // Forcing contiguous allocation, for now + // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + CHECK_EQ(ndim, 1); + std::vector shape; + for (int i = 0; i < ndim; ++i) { + shape.push_back(args[6 + i]); + } + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + HexagonBuffer* hexbuf = reinterpret_cast( + hexapi->AllocVtcmWorkspace(dev, ndim, shape.data(), type_hint, String(scope))); + + // Assumes a single contiguous allocation + // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + void* ptr = hexbuf->GetPointer()[0]; + vtcmallocs[ptr] = hexbuf; + *rv = ptr; +}); + +TVM_REGISTER_GLOBAL("device_api.hexagon.FreeNd").set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + std::string scope = args[2]; + CHECK(scope.find("vtcm") != std::string::npos); + void* ptr = args[3]; + CHECK(vtcmallocs.find(ptr) != vtcmallocs.end()); + + HexagonBuffer* hexbuf = vtcmallocs[ptr]; + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); + hexapi->FreeVtcmWorkspace(dev, hexbuf); + *rv = static_cast(0); +}); TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index 5cbdf8907fa9..c07848241379 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -17,12 +17,16 @@ * under the License. */ + #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ #include #include +#include +#include +#include namespace tvm { namespace runtime { diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 26b9c08487cb..379f5f772a04 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -438,44 +438,45 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic initialized_ = true; } -TVM_REGISTER_GLOBAL("device_api.opencl.AllocNdWithScope") - .set_body([](TVMArgs args, TVMRetValue* rv) { - int32_t device_type = args[0]; - int32_t device_id = args[1]; - int32_t dtype_code_hint = args[2]; - int32_t dtype_bits_hint = args[3]; - std::string scope = args[4]; // TODO: check scope = texture - int64_t order = args[5]; // TODO: check order = 2 - int64_t width = args[6]; - int64_t height = args[7]; - - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; - - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - *rv = ptr->AllocTextureWorkspace(dev, static_cast(width), static_cast(height), - type_hint); - }); - -TVM_REGISTER_GLOBAL("device_api.opencl.FreeNdWithScope") - .set_body([](TVMArgs args, TVMRetValue* rv) { - int32_t device_type = args[0]; - int32_t device_id = args[1]; - std::string scope = args[2]; // TODO: check scope = texture - void* data = args[3]; - OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); - Device dev; - dev.device_type = static_cast(device_type); - dev.device_id = device_id; - ptr->FreeTextureWorkspace(dev, data); - *rv = static_cast(0); - }); +TVM_REGISTER_GLOBAL("device_api.opencl.AllocNd").set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + int32_t dtype_code_hint = args[2]; + int32_t dtype_bits_hint = args[3]; + std::string scope = args[4]; + CHECK(scope.find("texture") != std::string::npos); + int64_t ndim = args[5]; + CHECK_EQ(ndim, 2); + int64_t width = args[6]; + int64_t height = args[7]; + + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + + DLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); + *rv = ptr->AllocTextureWorkspace(dev, static_cast(width), static_cast(height), + type_hint); +}); + +TVM_REGISTER_GLOBAL("device_api.opencl.FreeNd").set_body([](TVMArgs args, TVMRetValue* rv) { + int32_t device_type = args[0]; + int32_t device_id = args[1]; + std::string scope = args[2]; + CHECK(scope.find("texture") != std::string::npos); + void* data = args[3]; + OpenCLWorkspace* ptr = OpenCLWorkspace::Global(); + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + ptr->FreeTextureWorkspace(dev, data); + *rv = static_cast(0); +}); TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = OpenCLWorkspace::Global(); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 93fe9eef9546..8b37a116beea 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -475,7 +475,7 @@ class BuiltinLower : public StmtExprMutator { fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); Array args = { - StringImm(fdevapi_prefix + ".AllocNdWithScope"), + StringImm(fdevapi_prefix + ".AllocNd"), device_type_, device_id_, IntImm(DataType::Int(32), dtype.code()), @@ -490,9 +490,9 @@ class BuiltinLower : public StmtExprMutator { Stmt alloca = LetStmt(let->var, call_packed, body); PrimExpr storage_scope = call->args[0]; - Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeNdWithScope"), device_type_, device_id_, - storage_scope, let->var}); + Call free_op = Call( + DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeNd"), device_type_, device_id_, storage_scope, let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); From caed9f158118cff72fba014e56ec9da8ca6500fc Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 10 Mar 2022 08:14:31 -0800 Subject: [PATCH 32/38] fixed missed format error --- src/runtime/hexagon/hexagon/hexagon_device_api_v2.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h index c07848241379..9e39fc0b0f97 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h @@ -17,16 +17,15 @@ * under the License. */ - #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ #include -#include #include -#include #include +#include +#include namespace tvm { namespace runtime { From 0f59317b4ab82041cd6cfaa25516a986bf1d12bd Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 10 Mar 2022 22:13:43 -0800 Subject: [PATCH 33/38] restart ci From b7d8dd00220ccb14cecb8b04fdc103d89132dad9 Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 11 Mar 2022 14:23:51 -0800 Subject: [PATCH 34/38] fix test random value issue + code review feedback --- src/runtime/hexagon/hexagon/hexagon_common.cc | 2 +- .../hexagon/hexagon/hexagon_device_api_v2.cc | 17 ++++++++++++----- .../test_hexagon/test_cache_read_write.py | 12 +++++++++--- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon/hexagon_common.cc index 246a956ee66b..91d11bc56974 100644 --- a/src/runtime/hexagon/hexagon/hexagon_common.cc +++ b/src/runtime/hexagon/hexagon/hexagon_common.cc @@ -88,7 +88,7 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& DLTensor* tensor = static_cast(arg_values[i].v_handle); buffer_args.emplace_back(i, static_cast(tensor->data)); // Assumes a single contiguous allocation - // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + // TODO(Straw): Enable discontiguous allocation tensor->data = buffer_args.back().second->GetPointer()[0]; } } diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 4cc72dbaf4a9..16491142ac9d 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -59,7 +59,7 @@ void* HexagonDeviceAPIv2::AllocDataSpace(Device dev, int ndim, const int64_t* sh CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; // Forcing contiguous allocation, for now - // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + // TODO(Straw): Enable discontiguous allocation size_t nallocs = 1; size_t nbytes = 1; for (int i = 0; i < ndim; ++i) { @@ -104,7 +104,7 @@ void* HexagonDeviceAPIv2::AllocWorkspace(Device dev, size_t size, DLDataType typ dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size)); // Assumes a single contiguous allocation - // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + // TODO(Straw): Enable discontiguous allocation void* ptr = hexbuf->GetPointer()[0]; workspace_allocations_.insert({ptr, hexbuf}); return ptr; @@ -121,10 +121,17 @@ void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) { void* HexagonDeviceAPIv2::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { + CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; + // Forcing contiguous allocation, for now + // TODO(Straw): Enable discontiguous allocation + CHECK_EQ(ndim, 1); return AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } -void HexagonDeviceAPIv2::FreeVtcmWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } +void HexagonDeviceAPIv2::FreeVtcmWorkspace(Device dev, void* ptr) { + CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; + FreeDataSpace(dev, ptr); +} void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { CHECK_EQ(from->byte_offset, 0); @@ -181,7 +188,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNd").set_body([](TVMArgs args, TVMR CHECK(scope.find("vtcm") != std::string::npos); int64_t ndim = args[5]; // Forcing contiguous allocation, for now - // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + // TODO(Straw): Enable discontiguous allocation CHECK_EQ(ndim, 1); std::vector shape; for (int i = 0; i < ndim; ++i) { @@ -202,7 +209,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNd").set_body([](TVMArgs args, TVMR hexapi->AllocVtcmWorkspace(dev, ndim, shape.data(), type_hint, String(scope))); // Assumes a single contiguous allocation - // TODO(Straw): Enable discontiguous allocation after RFC 39 lands + // TODO(Straw): Enable discontiguous allocation void* ptr = hexbuf->GetPointer()[0]; vtcmallocs[ptr] = hexbuf; *rv = ptr; diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index fb9b352476bd..a638d733b0d2 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -125,9 +125,15 @@ def test_cache_read_write( with launcher.start_session() as sess: mod = launcher.load_module(dso_binary, sess) - xt = tvm.nd.array(np.random.uniform(size=size).astype(x.dtype), device=sess.device) - yt = tvm.nd.array(np.random.uniform(size=size).astype(y.dtype), device=sess.device) - zt = tvm.nd.array(np.random.uniform(size=size).astype(z.dtype), device=sess.device) + xt = tvm.nd.array( + np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device + ) + yt = tvm.nd.array( + np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device + ) + zt = tvm.nd.array( + np.random.randint(-128, high=127, size=size, dtype=x.dtype), device=sess.device + ) mod["dmacpy"](xt, yt, zt) launcher.stop_server() From c3a3b3083818cf9e1caa009b0918659b10d6b3b3 Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 11 Mar 2022 15:32:18 -0800 Subject: [PATCH 35/38] fix test hang --- src/tir/transforms/lower_vtcm_alloc.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index d133ce5e9d16..55ac40c96bb4 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -34,12 +34,11 @@ class VtcmAllocator : public StmtExprMutator { VtcmAllocator() {} Stmt VisitStmt_(const AllocateNode* op) final { - Stmt body = this->VisitStmt(op->body); std::string storage_scope = GetStorageScope(op->buffer_var); - Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); + Stmt stmt = StmtExprMutator::VisitStmt_(op); if (IsVtcmStorage(storage_scope)) { + Stmt body = this->VisitStmt(op->body); Array args; args.push_back(StringImm(storage_scope)); args.push_back(IntImm(DataType::Int(64), op->extents.size())); From 53ce1ee80397d3595f4754b5147ca1158690dd23 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 14 Mar 2022 08:45:55 -0700 Subject: [PATCH 36/38] restructure lower vtcm pass per code review feedback (option a) --- src/tir/transforms/lower_vtcm_alloc.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index 55ac40c96bb4..a1af509fb31e 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -35,8 +35,6 @@ class VtcmAllocator : public StmtExprMutator { Stmt VisitStmt_(const AllocateNode* op) final { std::string storage_scope = GetStorageScope(op->buffer_var); - - Stmt stmt = StmtExprMutator::VisitStmt_(op); if (IsVtcmStorage(storage_scope)) { Stmt body = this->VisitStmt(op->body); Array args; @@ -45,11 +43,9 @@ class VtcmAllocator : public StmtExprMutator { for (size_t i = 0; i < op->extents.size(); ++i) { args.push_back(op->extents[i]); } - stmt = LetStmt(op->buffer_var, - Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); + return LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); } - - return stmt; + return StmtExprMutator::VisitStmt_(op); } protected: From 412857c7cea3e3d83a893a174385a43ad2041d07 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 14 Mar 2022 08:57:58 -0700 Subject: [PATCH 37/38] format error --- src/tir/transforms/lower_vtcm_alloc.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index a1af509fb31e..c540a98f6bcb 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -43,9 +43,10 @@ class VtcmAllocator : public StmtExprMutator { for (size_t i = 0; i < op->extents.size(); ++i) { args.push_back(op->extents[i]); } - return LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); + return LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); } - return StmtExprMutator::VisitStmt_(op); + return StmtExprMutator::VisitStmt_(op); } protected: From 38cd9750c886261dfec86fedaadd21a68da2632c Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 14 Mar 2022 10:37:36 -0700 Subject: [PATCH 38/38] global.vtcm + tvm_stack_make_shape --- src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc | 9 +++------ src/runtime/opencl/opencl_device_api.cc | 5 +++-- src/tir/transforms/lower_vtcm_alloc.cc | 8 ++++---- src/tir/transforms/texture_flatten.cc | 6 +++++- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index 16491142ac9d..9d5b8f87f446 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -185,15 +185,12 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNd").set_body([](TVMArgs args, TVMR int32_t dtype_code_hint = args[2]; int32_t dtype_bits_hint = args[3]; std::string scope = args[4]; - CHECK(scope.find("vtcm") != std::string::npos); + CHECK(scope.find("global.vtcm") != std::string::npos); int64_t ndim = args[5]; // Forcing contiguous allocation, for now // TODO(Straw): Enable discontiguous allocation CHECK_EQ(ndim, 1); - std::vector shape; - for (int i = 0; i < ndim; ++i) { - shape.push_back(args[6 + i]); - } + int64_t* shape = static_cast(static_cast(args[6])); Device dev; dev.device_type = static_cast(device_type); @@ -206,7 +203,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.AllocNd").set_body([](TVMArgs args, TVMR HexagonDeviceAPIv2* hexapi = HexagonDeviceAPIv2::Global(); HexagonBuffer* hexbuf = reinterpret_cast( - hexapi->AllocVtcmWorkspace(dev, ndim, shape.data(), type_hint, String(scope))); + hexapi->AllocVtcmWorkspace(dev, ndim, shape, type_hint, String(scope))); // Assumes a single contiguous allocation // TODO(Straw): Enable discontiguous allocation diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 379f5f772a04..36bb156c8e9f 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -447,8 +447,9 @@ TVM_REGISTER_GLOBAL("device_api.opencl.AllocNd").set_body([](TVMArgs args, TVMRe CHECK(scope.find("texture") != std::string::npos); int64_t ndim = args[5]; CHECK_EQ(ndim, 2); - int64_t width = args[6]; - int64_t height = args[7]; + int64_t* shape = static_cast(static_cast(args[6])); + int64_t width = shape[0]; + int64_t height = shape[1]; Device dev; dev.device_type = static_cast(device_type); diff --git a/src/tir/transforms/lower_vtcm_alloc.cc b/src/tir/transforms/lower_vtcm_alloc.cc index c540a98f6bcb..0b5f7bf1554d 100644 --- a/src/tir/transforms/lower_vtcm_alloc.cc +++ b/src/tir/transforms/lower_vtcm_alloc.cc @@ -26,7 +26,9 @@ namespace tvm { namespace tir { -inline bool IsVtcmStorage(std::string scope) { return scope.find("vtcm") != std::string::npos; } +inline bool IsVtcmStorage(std::string scope) { + return scope.find("global.vtcm") != std::string::npos; +} class VtcmAllocator : public StmtExprMutator { public: @@ -40,9 +42,7 @@ class VtcmAllocator : public StmtExprMutator { Array args; args.push_back(StringImm(storage_scope)); args.push_back(IntImm(DataType::Int(64), op->extents.size())); - for (size_t i = 0; i < op->extents.size(); ++i) { - args.push_back(op->extents[i]); - } + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), op->extents)); return LetStmt(op->buffer_var, Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); } diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 44d40fb1c60b..a607e5914b39 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -115,7 +115,11 @@ class TextureFlattener : public TextureLoweringBase { size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); - Array args = {StringImm(storage_scope), 2, texture.width, texture.height}; + Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 2)); // 2d + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height})); stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), body); }