From 9308a3f82111404e4b90b439df4adca8b46ff9d1 Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne
Date: Thu, 4 Nov 2021 19:30:43 +0000
Subject: [PATCH 01/15] USMP integration with AoT executor codegen

This commit integrates USMP with the AoT executor codegen.
Additionally, this commit introduces two PassContext options,
tir.usmp.disable and tir.disable_storage_rewrite, to disable
USMP and the StorageRewrite pass respectively.

Change-Id: I98d905fab7f49c2de52e126115953a40b0821e21
---
 include/tvm/tir/transform.h                   |   7 +
 include/tvm/tir/usmp/algo/algo.h              |  62 +++++
 include/tvm/tir/usmp/analysis.h               |  49 ++++
 include/tvm/tir/usmp/transform.h              |  66 +++++
 include/tvm/tir/usmp/utils.h                  |  37 ++-
 src/driver/driver_api.cc                      |   7 +-
 src/relay/backend/aot_executor_codegen.cc     | 216 +++++++++++------
 src/runtime/meta_data.h                       |  18 +-
 src/target/source/codegen_source_base.cc      |  14 +-
 src/target/source/codegen_source_base.h       |   6 +
 src/target/source/source_module.cc            | 227 ++++++++++++++----
 src/tir/transforms/make_unpacked_api.cc       |  31 +--
 src/tir/usmp/analysis/extract_buffer_info.cc  |  19 +-
 src/tir/usmp/transform/assign_pool_info.cc    | 120 +++++++++
 .../convert_pool_allocations_to_offsets.cc    |  47 +++-
 src/tir/usmp/unified_static_memory_planner.cc |  96 ++++++++
 src/tir/usmp/utils.cc                         |  82 ++++++-
 tests/python/contrib/test_ethosu/infra.py     |   4 +-
 tests/python/relay/aot/aot_test_utils.py      |   6 +-
 tests/python/relay/aot/corstone300.ld         |   7 +
 tests/python/relay/aot/test_crt_aot.py        |  35 ++-
 21 files changed, 971 insertions(+), 185 deletions(-)
 create mode 100644 include/tvm/tir/usmp/algo/algo.h
 create mode 100644 include/tvm/tir/usmp/analysis.h
 create mode 100644 include/tvm/tir/usmp/transform.h
 create mode 100644 src/tir/usmp/transform/assign_pool_info.cc
 create mode 100644 src/tir/usmp/unified_static_memory_planner.cc

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
index 7a6cfa364447..97d750ce6aad 100644
--- a/include/tvm/tir/transform.h
+++ b/include/tvm/tir/transform.h
@@ -484,6 +484,13 @@ TVM_DLL Pass MergeDynamicSharedMemoryAllocations();
  */
 TVM_DLL Pass ConvertForLoopsToSerial();
 
+/*!
+ * \brief The unified static memory planner pass that plans
+ * memory both within (intra-) and across (inter-) PrimFuncs.
+ * \return The pass.
+ */
+TVM_DLL Pass UnifiedStaticMemoryPlanner();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm
diff --git a/include/tvm/tir/usmp/algo/algo.h b/include/tvm/tir/usmp/algo/algo.h
new file mode 100644
index 000000000000..8a2e3475ce19
--- /dev/null
+++ b/include/tvm/tir/usmp/algo/algo.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir/usmp/algo/algo.h + * \brief The memory planning algorithm for USMP + */ + +#ifndef TVM_TIR_USMP_ALGO_ALGO_H_ +#define TVM_TIR_USMP_ALGO_ALGO_H_ + +#include + +namespace tvm { +namespace tir { +namespace usmp { +namespace algo { + +/*! + * \brief The Greedy-by-Size algorithm to plan memory + * + * This will perform a greedy algorithm in deciding the offsets + * within provided Pools, using the size of the buffer. + * + * \return A Map of BufferInfo objects and their associated PoolAllocation + */ +Map GreedyBySize(const Array& buffer_info_arr, + const Integer& memory_pressure); + +/*! + * \brief The Greedy-by-Conflicts algorithm to plan memory + * + * This will perform a greedy algorithm in deciding the offsets + * within provided Pools, using the number of liveness conflicts of the buffer. + * + * \return A Map of BufferInfo objects and their associated PoolAllocation + */ +Map GreedyByConflicts(const Array& buffer_info_arr, + const Integer& memory_pressure); + +} // namespace algo +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_ALGO_ALGO_H_ diff --git a/include/tvm/tir/usmp/analysis.h b/include/tvm/tir/usmp/analysis.h new file mode 100644 index 000000000000..a24851d33182 --- /dev/null +++ b/include/tvm/tir/usmp/analysis.h @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/usmp/analysis.h + * \brief The analysis passes for TIR-based Unified Static Memory Planner + */ + +#ifndef TVM_TIR_USMP_ANALYSIS_H_ +#define TVM_TIR_USMP_ANALYSIS_H_ + +#include +#include + +namespace tvm { +namespace tir { +namespace usmp { + +/*! + * \brief Extract BufferInfo objects from a TIR IRModule + * + * This pass would extract the buffer information of allocate nodes + * including liveness conflict with other buffer info objects. + * + * \return A Map of BufferInfo objects and their associated Stmts + */ +BufferInfoAnalysis ExtractBufferInfo(const PrimFunc& main_func, const IRModule& mod); + +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_ANALYSIS_H_ diff --git a/include/tvm/tir/usmp/transform.h b/include/tvm/tir/usmp/transform.h new file mode 100644 index 000000000000..7bc08ea3933c --- /dev/null +++ b/include/tvm/tir/usmp/transform.h @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/usmp/transform.h + * \brief The transform passes for TIR-based Unified Static Memory Planner + */ + +#ifndef TVM_TIR_USMP_TRANSFORM_H_ +#define TVM_TIR_USMP_TRANSFORM_H_ + +#include + +namespace tvm { +namespace tir { +namespace usmp { +namespace transform { + +using Pass = tvm::transform::Pass; + +/*! + * \brief Convert the analyzed PoolAllocation to offsets from pool variables + * + * This pass would convert the IRModule that contains all PrimFuncs that contains + * the associated PoolAllocations to be read from being offset from the input var + * of the PrimFunc. + * + * \return the pass + */ +TVM_DLL Pass ConvertPoolAllocationsToOffsets(const Map& pool_allocations, + Bool emit_tvmscript_printable = Bool(false)); + +/*! + * \brief Assign PoolInfo objects to tir.allocate nodes depending on the PrimFunc's target + * + * This pass would assign PoolInfo objects to tir.allocate nodes depending on the each target + * that each PrimFunc would belong to. If there are not any pools provided in the IRModule, + * this pass would create a global workspace pool that every target could access for as the + * default behaviour. + * + * \return the pass + */ +TVM_DLL Pass AssignPoolInfo(); + +} // namespace transform +} // namespace usmp +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_USMP_TRANSFORM_H_ diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index 30c8f2ddea49..cf02596cb46c 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -26,6 +26,7 @@ #define TVM_TIR_USMP_UTILS_H_ #include +#include #include #include @@ -59,22 +60,26 @@ struct PoolInfoNode : public Object { Integer size_hint_bytes; /*! \brief The accessibility from each Target*/ Map target_access; // 'rw' or 'ro' + /*! 
\brief Whether pool is internally generated*/ + Bool is_internal = Bool(false); void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pool_name", &pool_name); v->Visit("size_hint_bytes", &size_hint_bytes); v->Visit("target_access", &target_access); + v->Visit("is_internal", &is_internal); } bool SEqualReduce(const PoolInfoNode* other, SEqualReducer equal) const { return equal(pool_name, other->pool_name) && equal(size_hint_bytes, other->size_hint_bytes) && - equal(target_access, other->target_access); + equal(target_access, other->target_access) && equal(is_internal, other->is_internal); } void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(pool_name); hash_reduce(size_hint_bytes); hash_reduce(target_access); + hash_reduce(is_internal); } static constexpr const char* _type_key = "tir.usmp.PoolInfo"; @@ -89,7 +94,8 @@ static const int kUnrestrictedPoolSizeHint = -1; class PoolInfo : public ObjectRef { public: TVM_DLL PoolInfo(String pool_name, Map target_access, - Integer size_hint_bytes = kUnrestrictedPoolSizeHint); + Integer size_hint_bytes = kUnrestrictedPoolSizeHint, + Bool is_internal = Bool(false)); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PoolInfo, ObjectRef, PoolInfoNode); }; @@ -268,7 +274,16 @@ class AllocatedPoolInfo : public ObjectRef { * * \param buffer_info_map IR-bound BufferInfo map */ -Array CreateArrayBufferInfo(const Map& buffer_info_map); +Array CreateArrayBufferInfo(const Map& buffer_info_map); + +/*! + * \brief Calculate workspace required to execute a IRModule with main expressed in TIR + * + * \param mod the IRModule with TIR-based main function + */ +Integer CalculateModuleWorkspaceSize(const IRModule& mod); + +void PrintConflicts(const Array& buffer_info_arr); /*! * \brief The allocate node attribute to indicate candidate memory pools. @@ -284,6 +299,16 @@ static constexpr const char* kPoolCandidatesAllocateAttr = "candidate_memory_poo */ Integer CalculateExtentsSize(const AllocateNode* op); +/*! + * \brief Joins the Stmt nodes with PoolAllocation objects + * + * \param buffer_info_to_stmt the map of BufferInfo objects to Stmt nodes + * \param buffer_info_to_pool_allocation the map of BufferInfo objects to PoolAllocation objects + */ +Map AssignStmtPoolAllocations( + const Map& buffer_info_to_stmt, + const Map& buffer_info_to_pool_allocation); + } // namespace usmp } // namespace tir @@ -294,6 +319,12 @@ namespace attr { */ static constexpr const char* kPoolArgs = "pool_args"; +/*! + * \brief This is a BaseFunc attribute to indicate which input var represent + * a PoolInfo Object in the form of a Map. 
+ */ +static constexpr const char* kPoolInfoIRModuleAttr = "pool_infos"; + } // namespace attr } // namespace tvm diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7dc7b28b968b..e750344f4f0c 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -44,6 +44,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.detect_global_barrier", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_bound_checkers", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_assert", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_vectorize", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); @@ -191,6 +192,8 @@ Array CreatePassList(bool disable_loop_partition) { transform::PassContext pass_ctx = transform::PassContext::Current(); bool disable_vectorize = pass_ctx->GetConfig("tir.disable_vectorize", Bool(false)).value(); + bool disable_storage_rewrite = + pass_ctx->GetConfig("tir.disable_storage_rewrite", Bool(false)).value(); bool instrument_bound_checkers = pass_ctx->GetConfig("tir.instrument_bound_checkers", Bool(false)).value(); @@ -260,7 +263,9 @@ Array CreatePassList(bool disable_loop_partition) { pass_list.push_back(tir::transform::VectorizeLoop(!disable_vectorize)); pass_list.push_back(tir::transform::InjectVirtualThread()); pass_list.push_back(tir::transform::InjectDoubleBuffer()); - pass_list.push_back(tir::transform::StorageRewrite()); + if (!disable_storage_rewrite) { + pass_list.push_back(tir::transform::StorageRewrite()); + } pass_list.push_back(tir::transform::UnrollLoop()); // Add user-defined phase-2 passes diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index d901f8a26c4f..38b1cee6b9a2 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -290,7 +291,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - buffer_vars.push_back(main_signature_[input_vars_.size() + output_index]); + buffer_vars.push_back(GetBufferVarForIO(input_vars_.size() + output_index)); continue; } @@ -308,7 +309,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (input_iter != input_vars_.end()) { // Input variable int main_index = std::distance(input_vars_.begin(), input_iter); - return {main_signature_[main_index]}; + return {GetBufferVarForIO(main_index)}; } else { // Storage identifier (i.e., intermediate memory) return PackSid(arg); @@ -331,7 +332,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (params_by_expr_.find(arg) != params_by_expr_.end()) { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[arg])}); - args.push_back(param_handle); + args.push_back(tvm::tir::Cast(DataType::Handle(), param_handle)); } else { auto var_arg = FindExpr(arg); for (const auto& var : var_arg) { @@ -405,26 +406,11 @@ class AOTExecutorCodegen : public MixedModeVisitor { auto tmp1 = te::Var("tmp1", DataType::Handle()); te::Var loop_idx("i", DataType::Int(32)); auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true()); - - 
PrimExpr retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {in, 0, tir::builtin::kArrData}); - PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), - {out, 0, tir::builtin::kArrData}); - if (use_unpacked_api_) { - tostore = out; - } - - // Do not pack the input if the flag is set or the caller - // explicitly asked to do so (e.g., copying a param to the output) - if (use_unpacked_api_ || !pack_input) { - retval_get = in; - } - // Copy the variable from the input to the output - tir::Stmt copy = tir::For( - loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, - tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true())); - stmts_.push_back(tir::LetStmt(tmp1, tostore, copy)); + tir::Stmt copy = + tir::For(loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, + tir::Store(tmp1, tir::Let(tmp0, in, retval_i), loop_idx, tir::const_true())); + stmts_.push_back(tir::LetStmt(tmp1, out, copy)); } /* @@ -546,12 +532,12 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (params_by_expr_.find(expr) != params_by_expr_.end()) { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, - /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), param_handle, + /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]); } else { auto var_expr = FindExpr(expr); - CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], - /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), var_expr[0], + /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]); } } } @@ -572,7 +558,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { int output_index = std::distance(return_sid_.begin(), output_iter); auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); - CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, false, + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), param_handle, false, sinfo->storage_sizes_in_bytes[0]); } } @@ -645,32 +631,120 @@ class AOTExecutorCodegen : public MixedModeVisitor { // TODO(giuseros): we should allocate this once outside the PrimFunc // so we don't pay the price of allocation for every inference if (!allocated[sid]) { - body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body); + PointerType ptype = Downcast(sids_table_[sid]->type_annotation); + DataType element_type = Downcast(ptype->element_type)->dtype; + body = tir::Allocate(sids_table_[sid], element_type, {size}, tir::const_true(), body); } allocated[sid] = true; } } - // Define the attributes - body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body); - body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body); - // Define the PrimFunc attributes Map dict_attrs; String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_run_func_suffix); dict_attrs.Set("global_symbol", run_func_name); dict_attrs.Set("runner_function", Bool(true)); + dict_attrs.Set(tvm::attr::kTarget, target_host_); tir::Stmt device_activations = GenerateAllDeviceHook("Activate"); tir::Stmt device_deactivations = GenerateAllDeviceHook("Deactivate"); 
tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations}); // Make the PrimFunc - return tir::PrimFunc(main_signature_, final_body, VoidType(), Map(), + return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, DictAttrs(dict_attrs)); } + /*! + * brief Access IO vars using the buffer vars and + * not the actual var. + */ + tir::Var GetBufferVarForIO(int index) { return main_buffer_map_[main_signature_[index]]->data; } + + /*! + * brief Create tir::Var for input/output while updating + * the buffer_maps. + */ + void CreateIOVar(const Expr& expr, std::string name) { + if (expr->IsInstance()) { + Tuple tuple = Downcast(expr); + for (unsigned i = 0; i < tuple->fields.size(); i++) { + CreateIOVar(tuple->fields[i], name + std::to_string(i) + "_"); + } + } else { + tir::Var var = tir::Var(name, DataType::Handle()); + main_signature_.push_back(var); + auto tensor_type = expr->checked_type().as(); + DataType elem_type = tensor_type->dtype; + tir::Var buffer_var = + tir::Var(name + "_buffer_var", PointerType(PrimType(elem_type), "global")); + tir::Buffer buffer = tir::Buffer(buffer_var, elem_type, tensor_type->shape, {}, 0, + name + "_buffer", 16, 1, tir::BufferType::kDefault); + main_buffer_map_.Set(var, buffer); + } + } + + /*! + * brief This function is a wrapper to run memory planning + * followed by recording the latest workspaces required. + */ + IRModule PlanMemoryLoweredModule(const IRModule& mod) { + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool disable_usmp = pass_ctx->GetConfig("tir.usmp.disable", Bool(false)).value(); + + IRModule lowered_mod = mod->ShallowCopy(); + Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr("workspace-byte-alignment").value_or(16); + if (!disable_usmp) { + lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); + // Update workspace size based on the pool allocations. 
+ Optional> allocated_pool_infos = + lowered_mod->GetAttr>(tvm::attr::kPoolArgs); + int main_workspace_size = 0; + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : + allocated_pool_infos.value()) { + main_workspace_size += allocated_pool_info->allocated_size->value; + } + } + for (const auto& kv : function_metadata_) { + if (lowered_mod->ContainGlobalVar(kv.first) && + lowered_mod->Lookup(kv.first)->IsInstance()) { + tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); + Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + kv.second->workspace_sizes.Set(tgt, ws); + } + } + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + } else { + // Running StorageRewrite just on the main function + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + IRModule main_func_mod; + main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + tir_main_func); + main_func_mod = tir::transform::StorageRewrite()(main_func_mod); + lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + // Use the PrimFunc to calculate the workspace required to service the allocates + Integer main_workspace_size = + CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + } + return lowered_mod; + } + protected: /*! \brief mod */ runtime::Module* mod_; @@ -682,6 +756,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { Map device_contexts_; /*! \brief input and output variables belonging to the main function signature */ Array main_signature_; + /*! \brief input and output variables belonging to the main function signature */ + Map main_buffer_map_; /*! \brief target device */ tec::TargetMap targets_; /*! \brief target host */ @@ -773,7 +849,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - main_signature_.push_back(tir::Var("input", DataType::Handle())); + std::string input_name = input->name_hint(); + CreateIOVar(input, input_name); } // Define the storage allocator ids @@ -792,9 +869,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Retrieve the return sids return_sid_ = final_aot_allocator.GetReturnIds(); - for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { - main_signature_.push_back(tir::Var("output", DataType::Handle())); - } + // Insert outputs to main func signature + CreateIOVar(lowered_main_func->body, "output"); CollectDeviceVariables(lowered_mod->GetAttr>("device_contexts").value()); VisitExpr(lowered_main_func->body); @@ -802,7 +878,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Create the runner function. 
Please note that the function is not legal yet // because the packed calls arguments are not wrapped in TVMValues. To make this happen we need // to run the LegalizePackedCalls pass. - auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size()); LoweredOutput ret; ret.params = std::unordered_map>(); @@ -812,36 +887,23 @@ class AOTExecutorCodegen : public MixedModeVisitor { std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); } - // Build the TIR IRModule for the main AOT function - Map symbol_map; - symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); - IRModule mod_run(symbol_map, {}, {}, {}, mod->attrs); - VLOG(1) << "main module:" << std::endl << PrettyPrint(mod_run); - - // Apply storage rewrite pass to the runner function to do memory planning - auto storage_rewrite = tir::transform::StorageRewrite(); - mod_run = storage_rewrite(mod_run); - // The workspace for main function should be calculated after performing storage_rewrite for - // the top level TIR function. - Integer main_workspace_size = CalculateWorkspaceBytes( - Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), - workspace_byte_alignment); - - Optional main_func_info = - lowered_mod->GetAttr("main_func_info"); - - main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + // AoT Executor codegen works completely on TIR beyond this point, hence removing relay main + // function and replacing it with its TIR version. We should try to make this a Pass. + lowered_mod->Remove(lowered_mod->GetGlobalVar("main")); + auto prim_func = CreateMainFunc(mod_name, lowered_main_func->params.size()); + lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); + // Parallel for loops are not supported in AoT codegen. + lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod); + lowered_mod = PlanMemoryLoweredModule(lowered_mod); + ret.function_metadata = std::move(function_metadata_); // Legalize AOT if needed. 
This means that all the packed calls // need to be wrapped in TVMValues (unless use_unpacked_api is set) if (!use_unpacked_api_) { auto pack_calls = tir::transform::LegalizePackedCalls(); - mod_run = pack_calls(mod_run); + lowered_mod = pack_calls(lowered_mod); } - ret.function_metadata = std::move(function_metadata_); - Optional> external_modules = lowered_mod->GetAttr>("external_mods"); ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; @@ -859,20 +921,28 @@ class AOTExecutorCodegen : public MixedModeVisitor { ret.external_mods = external_modules.value(); - if (ret.lowered_funcs.find(target_host_) != ret.lowered_funcs.end()) { - VLOG(1) << "merging main into existing module for host target"; - ret.lowered_funcs[target_host_]->Update(mod_run); - } else { - VLOG(1) << "adding main into new module for host target"; - ret.lowered_funcs.Set(target_host_, mod_run); + Map pool_var_info; + std::vector pool_vars; + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Optional> allocated_pool_infos = + tir_main_func->GetAttr>(tvm::attr::kPoolArgs); + int main_workspace_size = 0; + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + pool_vars.push_back(allocated_pool_info->pool_var.value()); + pool_var_info.Set(allocated_pool_info->pool_var.value(), allocated_pool_info); + main_workspace_size += allocated_pool_info->allocated_size->value; + } } - - std::vector input_var_names(input_vars_.size()); - std::transform(input_vars_.begin(), input_vars_.end(), input_var_names.begin(), - [](Var input_var) -> String { return input_var->name_hint(); }); + Array devices = ListDevices(); + Array inputs = + Array(tir_main_func->params.begin(), + tir_main_func->params.begin() + tir_main_func->params.size() - + return_sid_.size() - pool_vars.size() - devices.size()); ret.metadata = - runtime::Metadata(input_var_names, ListDevices(), return_sid_.size(), - runtime::kTvmExecutorAot, mod_name, interface_api, use_unpacked_api_); + runtime::Metadata(inputs, pool_vars, devices, return_sid_.size(), runtime::kTvmExecutorAot, + mod_name, interface_api, use_unpacked_api_, pool_var_info); return ret; } diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 8996d1b76e1f..80d3eeb09dd8 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -55,9 +56,11 @@ inline String get_name_mangled(const String& module_name, const String& name) { class MetadataNode : public Object { public: /*! \brief input information for the main function */ - Array inputs; + Array inputs; + /*! \brief pool information for the main function */ + Array pools; /*! \brief number of outputs of the main function */ - int num_outputs = 1; + unsigned int num_outputs = 1; /*! \brief device contexts information for the main function */ Array devices; /*! \brief the executor to be used to run the model */ @@ -66,6 +69,8 @@ class MetadataNode : public Object { String interface_api; /*! \brief The internal API (packed or unpacked) in use */ bool unpacked_api; + /*! 
\brief the input var names that correspond to pool_inputs */ + Optional> pool_inputs; String mod_name = ""; @@ -79,16 +84,21 @@ class MetadataNode : public Object { */ class Metadata : public ObjectRef { public: - TVM_DLL Metadata(Array inputs, Array devices, int num_outputs, String executor, - String mod_name, String interface_api = "packed", bool unpacked_api = false) { + TVM_DLL Metadata(Array inputs, Array pools, Array devices, + int num_outputs, String executor, String mod_name, + String interface_api = "packed", bool unpacked_api = false, + Map pool_inputs = + Map()) { auto n = make_object(); n->inputs = inputs; + n->pools = pools; n->devices = devices; n->num_outputs = num_outputs; n->executor = executor; n->interface_api = interface_api; n->unpacked_api = unpacked_api; n->mod_name = mod_name; + n->pool_inputs = pool_inputs; data_ = std::move(n); } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 9f0cf9a70b61..518da725d08f 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -74,13 +74,13 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { std::string key = v->name_hint; std::string vid = GetUniqueName(key); var_idmap_[v] = vid; - return vid; + return SanitiseName(vid); } std::string CodeGenSourceBase::GetVarID(const tir::VarNode* v) const { auto it = var_idmap_.find(v); ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; - return it->second; + return SanitiseName(it->second); } void CodeGenSourceBase::PrintIndent() { @@ -113,5 +113,15 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } +std::string CodeGenSourceBase::SanitiseName(std::string name) const { + std::replace_if( + name.begin(), name.end(), + [](char c) { + { return !std::isalnum(c); } + }, + '_'); + return name; +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index d938469b8969..0a3066db51a6 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,6 +53,12 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); + /*! + * \brief Sanitize names by removing illegal characters + * \param name The name to be sanitised. + */ + std::string SanitiseName(std::string name) const; + protected: /*! 
\brief entry in ssa assign map */ struct SSAEntry { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index e01a3d93d087..9ecbf173cbbc 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -34,6 +34,7 @@ #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" +#include "codegen_c.h" #include "codegen_source_base.h" namespace tvm { @@ -165,6 +166,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Target target_; relay::Runtime runtime_; runtime::Metadata metadata_; + CodeGenC codegen_c_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -197,45 +199,161 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { << "}\n"; } + String GenerateDLTensorStructWrapper(String reference_arg) { + code_ << "DLTensor " << reference_arg << "_dlt = {\n"; + code_ << ".data = &" << reference_arg << "\n"; + code_ << "};\n"; + code_ << "TVMValue " << reference_arg << "_tvmv = {\n"; + code_ << ".v_handle = &" << reference_arg << "_dlt\n"; + code_ << "};\n"; + return reference_arg + "_tvmv"; + } + + void GenerateInternalWorkspaceBuffers() { + if (metadata_->pool_inputs.defined()) { + for (const auto& kv : metadata_->pool_inputs.value()) { + tir::usmp::AllocatedPoolInfo allocated_pool_info = kv.second; + if (allocated_pool_info->pool_info->is_internal) { + code_ << "__attribute__((section(\".bss.tvm\"), "; + code_ << "aligned(" << 16 << ")))\n"; + code_ << "static uint8_t " << allocated_pool_info->pool_info->pool_name << "[" + << allocated_pool_info->allocated_size->value << "];\n"; + } + } + } + } + + bool IsInternalWorkspaceBuffer(const tir::Var& pool_var) { + if (metadata_->pool_inputs.defined()) { + Map allocated_pool_infos = + metadata_->pool_inputs.value(); + if (allocated_pool_infos.find(pool_var) != allocated_pool_infos.end()) { + tir::usmp::AllocatedPoolInfo allocate_pool_info = allocated_pool_infos[pool_var]; + if (allocate_pool_info->pool_info->is_internal) { + return true; + } + } + } + return false; + } + void GenerateEntrypointForUnpackedAPI(const std::string& entrypoint_name, const std::string& run_func) { code_ << "TVM_DLL int32_t " << run_func << "("; - unsigned int total_args = (metadata_->inputs.size() + metadata_->num_outputs); - for (unsigned int i = 0; i < total_args; ++i) { - code_ << "void* arg" << i; - if (i + 1 != total_args) { - code_ << ","; + + { + std::stringstream call_args_ss; + for (const tir::Var& input_var : metadata_->inputs) { + if (input_var->type_annotation.defined()) { + codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(input_var.dtype(), call_args_ss); + } + call_args_ss << " " << input_var->name_hint << ","; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "void* output" << i << ","; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (pool_var->type_annotation.defined()) { + codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + } + call_args_ss << " " << pool_var->name_hint << ","; } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } + code_ << ");\n"; code_ << "int32_t " << entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; code_ << "return " << run_func << "("; - for (unsigned int i = 0; i < 
metadata_->inputs.size(); ++i) { - code_ << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,"; + + { + std::stringstream call_args_ss; + for (unsigned int i = 0; i < metadata_->inputs.size(); ++i) { + call_args_ss << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,"; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + int j = metadata_->inputs.size() + i; + call_args_ss << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data,"; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + call_args_ss << "&" << metadata_->pool_inputs.value()[pool_var]->pool_info->pool_name + << ","; + } + } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; + code_ << ");\n"; + code_ << "}\n"; + } + } + + std::unordered_map GenerateRunFuncToEntryPointArgMap() { + std::unordered_map run_func_to_entry_point_args; + int entrypoint_arg_count = 0; + int run_func_arg_count = 0; + + for (unsigned int i = 0; i < metadata_->inputs.size(); i++) { + run_func_to_entry_point_args[run_func_arg_count] = Integer(entrypoint_arg_count); + entrypoint_arg_count++; + run_func_arg_count++; } - for (int i = 0; i < metadata_->num_outputs; ++i) { - int j = metadata_->inputs.size() + i; - code_ << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data"; - if (i + 1 != metadata_->num_outputs) { - code_ << ","; + for (unsigned int i = 0; i < metadata_->num_outputs; i++) { + run_func_to_entry_point_args[run_func_arg_count] = Integer(entrypoint_arg_count); + entrypoint_arg_count++; + run_func_arg_count++; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + tir::usmp::AllocatedPoolInfo allocated_pool_info = metadata_->pool_inputs.value()[pool_var]; + run_func_to_entry_point_args[run_func_arg_count] = + allocated_pool_info->pool_info->pool_name; + run_func_arg_count++; } } - code_ << ");\n"; - code_ << "}\n"; + return run_func_to_entry_point_args; } void GenerateEntrypointForPackedAPI(const std::string& entrypoint_name, const std::string& run_func) { code_ << "TVM_DLL int32_t " << run_func; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " - "out_type_code, void* resource_handle);\n"; + "out_type_code, void* resource_handle);\n\n"; + code_ << "int32_t " << entrypoint_name; code_ << "(void* args, void* type_code, int num_args, void* out_value, void* " "out_type_code, void* resource_handle) {\n"; + + // We are creating a copy of the set of pointers + size_t number_of_io_tensors = + metadata_->inputs.size() + metadata_->num_outputs + metadata_->pools.size(); + code_ << "TVMValue tensors[" << number_of_io_tensors << "];\n"; + + std::unordered_map run_func_to_entry_point_args = + GenerateRunFuncToEntryPointArgMap(); + for (unsigned int i = 0; i < number_of_io_tensors; i++) { + if (run_func_to_entry_point_args.find(i) != run_func_to_entry_point_args.end()) { + if (run_func_to_entry_point_args[i]->IsInstance()) { + String pool_name = Downcast(run_func_to_entry_point_args[i]); + String pool_name_tvmv = GenerateDLTensorStructWrapper(pool_name); + code_ << "tensors[" << i << "] = " << pool_name_tvmv << ";\n"; + } else { + code_ << "tensors[" << i << "] = ((TVMValue*)args)[" + << run_func_to_entry_point_args[Integer(i)] << "];\n"; + } + } + } + code_ << "return " << run_func; - code_ << "(args, type_code, num_args, out_value, out_type_code, resource_handle);\n"; + code_ << "((void*)tensors, type_code, 
num_args, out_value, out_type_code, resource_handle);\n"; code_ << "}\n"; } @@ -245,14 +363,35 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { const std::string& mod_name) { code_ << "#include <" << mod_name << ".h>\n"; code_ << "TVM_DLL int32_t " << run_func << "("; - unsigned int total_args = - (metadata_->inputs.size() + metadata_->devices.size() + metadata_->num_outputs); - for (unsigned int i = 0; i < total_args; ++i) { - code_ << "void* arg" << i; - if (i + 1 != total_args) { - code_ << ","; + { + std::stringstream call_args_ss; + for (const tir::Var& input_var : metadata_->inputs) { + if (input_var->type_annotation.defined()) { + codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(input_var.dtype(), call_args_ss); + } + call_args_ss << " " << codegen_c_.SanitiseName(input_var->name_hint) << ","; + } + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "void* output" << i << ","; + } + for (const tir::Var& pool_var : metadata_->pools) { + if (pool_var->type_annotation.defined()) { + codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + } else { + codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + } + call_args_ss << " " << pool_var->name_hint << ","; + } + for (const String& device : metadata_->devices) { + call_args_ss << "void* " << device << ","; } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } + code_ << ");\n"; code_ << "int32_t " << entrypoint_name << "("; code_ << "struct " << runtime::get_name_mangled(mod_name, "inputs") << "* inputs,"; @@ -265,32 +404,32 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << ") {" << "return " << run_func << "("; - for (const auto& input : metadata_->inputs) { - std::string sanitised_input = input; - std::replace_if(sanitised_input.begin(), sanitised_input.end(), isNotAlnum, '_'); - code_ << "inputs->" << sanitised_input << ","; - } - if (metadata_->num_outputs == 1) { - code_ << "outputs->output"; - } else { - for (int i = 0; i < metadata_->num_outputs; ++i) { - code_ << "outputs->output" << i; - if (i + 1 != metadata_->num_outputs) { - code_ << ","; + + { + std::stringstream call_args_ss; + for (const auto& input : metadata_->inputs) { + call_args_ss << "inputs->" << codegen_c_.SanitiseName(input->name_hint) << ","; + } + if (metadata_->num_outputs == 1) { + call_args_ss << "outputs->output,"; + } else { + for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { + call_args_ss << "outputs->output" << i << ","; } } - } - - if (!metadata_->devices.empty()) { - code_ << ","; - for (const String& device : metadata_->devices) { - code_ << "devices->" << device; - if (device != metadata_->devices.back()) { - code_ << ","; + for (const tir::Var& pool_var : metadata_->pools) { + if (IsInternalWorkspaceBuffer(pool_var)) { + call_args_ss << "&" << metadata_->pool_inputs.value()[pool_var]->pool_info->pool_name + << ","; } } + for (const String& device : metadata_->devices) { + call_args_ss << "devices->" << device << ","; + } + std::string call_args_str = call_args_ss.str(); + call_args_str.pop_back(); + code_ << call_args_str; } - code_ << ");\n"; code_ << "}\n"; } @@ -309,6 +448,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "extern \"C\" {\n"; code_ << "#endif\n"; + GenerateInternalWorkspaceBuffers(); + if (metadata_->unpacked_api) { if (metadata_->interface_api == "c") { GenerateCInterfaceEntrypoint(entrypoint_mangled, 
run_func_mangled, metadata_->mod_name); diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index 169983a525df..d118547517fe 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -57,33 +57,22 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { const Stmt nop = Evaluate(0); std::vector device_init; - // Create arg to buffer binder - std::unordered_map vmap; - ArgBinder binder(&vmap); // Collect variables and buffers to map between Array args; - std::vector> var_def; - bool buffer_map_found = false; - - for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { - Var param = func_ptr->params[i]; - - auto it = func_ptr->buffer_map.find(param); - if (it != func_ptr->buffer_map.end()) { - args.push_back((*it).second->data); - buffer_map_found = true; - } else { - args.push_back(param); - } + // We only iterate the function params upto number of buffer_map keys + // because if there exist a resource handle, it will not have a buffer + for (unsigned int i = 0; i < func->buffer_map.size(); i++) { + Var param = func->params[i]; + args.push_back(func->buffer_map[param]->data); } - - if (buffer_map_found) { - device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); - device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + if (func->params.size() == func->buffer_map.size() + 1) { + args.push_back(func->params.back()); } + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); - func_ptr->body = MergeNest({device_init, binder.init_nest(), binder.asserts()}, func_ptr->body); + func_ptr->body = MergeNest(device_init, func_ptr->body); func_ptr->params = args; func_ptr->ret_type = PrimType(DataType::Int(32)); diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index ea53f27e5558..fb4fb52c507e 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -58,7 +59,9 @@ class BufferInfoExtractor : public StmtExprVisitor { public: explicit BufferInfoExtractor(const IRModule& module) : module_(module) { for (const auto& gv_func : module_->functions) { - functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + if (gv_func.second->IsInstance()) { + functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + } } // Pushing a scope info for the initial body of the main function scope_stack_.push(ScopeInfo()); @@ -342,16 +345,24 @@ void BufferInfoExtractor::VisitExpr_(const VarNode* op) { Array static GetMatchedBuffers(const PrimFunc& func) { Array buffer_vars; - for (const auto& param : func->params) { + for (unsigned int i = 0; i < func->params.size() - 1; i++) { + Var param = func->params[i]; buffer_vars.push_back(func->buffer_map[param]->data); } + Var last_param = func->params.back(); + // Checks whether last var is present in the buffer map + // because it could be the resource handle + if (func->buffer_map.find(last_param) != func->buffer_map.end()) { + buffer_vars.push_back(func->buffer_map[last_param]->data); + } return buffer_vars; } void BufferInfoExtractor::UpdateAliases(const Array& args, const PrimFunc& func) { auto param_buffers = GetMatchedBuffers(func); - ICHECK(args.size() == param_buffers.size()); - for (size_t i = 0; i < args.size(); i++) { + // Last var could be a resource 
handle that does not have a Buffer + ICHECK(args.size() == param_buffers.size() || args.size() - 1 == param_buffers.size()); + for (size_t i = 0; i < param_buffers.size(); i++) { auto arg = args[i]; auto param_buf = param_buffers[i]; // If tir.allocates are passed in to functions diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc new file mode 100644 index 000000000000..ef6559b27caa --- /dev/null +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace tir { +namespace usmp { + +/*! \brief Assign PoolInfo objects to allocate that does not have any. + * The schedulers have the oppurtunity to assign PoolInfo objects to + * allocate nodes. However, each allocate node is expected to have + * at least one PoolInfo node assigned to it. If it was not the case, + * this Pass will assign all PoolInfo objects that the target could + * access.*/ +class PoolInfoAssigner : public StmtExprMutator { + public: + explicit PoolInfoAssigner(const IRModule& module) { + PrimFunc main_func = + Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + ICHECK(main_func.defined()) << "main function is not in the module"; + Optional target_host = main_func->GetAttr(tvm::attr::kTarget); + ICHECK(target_host) << "main function does not have a target attr"; + Array pool_infos = + module->GetAttr>(tvm::attr::kPoolInfoIRModuleAttr) + .value_or({usmp::PoolInfo("global_workspace", + {{target_host.value(), usmp::kTargetPoolReadWriteAccess}}, + usmp::kUnrestrictedPoolSizeHint, Bool(true))}); + for (const usmp::PoolInfo& pool_info : pool_infos) { + for (const auto& kv : pool_info->target_access) { + Target tgt = kv.first; + if (target_pool_infos_.find(tgt) == target_pool_infos_.end()) { + target_pool_infos_.Set(tgt, Array()); + } + Array pool_info_arr = target_pool_infos_[tgt]; + pool_info_arr.push_back(pool_info); + target_pool_infos_.Set(tgt, pool_info_arr); + } + } + mod_ = module->ShallowCopy(); + } + + IRModule operator()(); + + private: + Stmt VisitStmt_(const AllocateNode* op) override; + + IRModule mod_; + Map> target_pool_infos_; + PrimFunc func_; +}; + +Stmt PoolInfoAssigner::VisitStmt_(const AllocateNode* op) { + Optional tgt = func_->GetAttr(tvm::attr::kTarget).value(); + ICHECK(tgt) << "The following PrimFunc does not have a target attr: \n" << func_; + Map annotations = Map(op->annotations); + if (op->annotations.find(kPoolCandidatesAllocateAttr) == op->annotations.end()) { + annotations.Set(kPoolCandidatesAllocateAttr, target_pool_infos_[tgt.value()]); + } + Stmt body = VisitStmt(op->body); + auto allocate = + 
Allocate(op->buffer_var, op->dtype, op->extents, op->condition, body, annotations); + return allocate; +} + +IRModule PoolInfoAssigner::operator()() { + for (const auto& kv : mod_->functions) { + GlobalVar gv = kv.first; + if (kv.second->IsInstance()) { + func_ = Downcast(kv.second); + Stmt body = this->VisitStmt(func_->body); + PrimFunc new_prim_func = + PrimFunc(func_->params, body, func_->ret_type, func_->buffer_map, func_->attrs); + mod_->Update(gv, new_prim_func); + } + } + return mod_; +} + +namespace transform { + +tvm::transform::Pass AssignPoolInfo() { + auto pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { + return PoolInfoAssigner(m)(); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "tir.usmp.AssignPoolInfo", {}); +} + +TVM_REGISTER_GLOBAL("tir.usmp.transform.AssignPoolInfo").set_body_typed(AssignPoolInfo); + +} // namespace transform + +} // namespace usmp +} // namespace tir +} // namespace tvm diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index 5ebf3c557b06..cd797681d474 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -120,12 +121,15 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { /*! \brief This is a helper to append the pool args to * the callsite of the function. */ - Array AppendPoolParamsToArgs(const Array& args); + Array AppendPoolParamsToArgs(Array args, const PrimFunc& func); /*! \brief Some arguments that used to be Allocate nodes * should be replaced by Let nodes in the pass that loads * the space from a pool variable. */ Array ReplaceAllocateArgsWithLetArgs(const Array& args); + /*! \brief Obtain a resource handle if its there + */ + Optional GetResourceHandle(const PrimFunc& func); /*! 
\brief The tir::Var map to PoolInfo objects */ Map primfunc_args_to_pool_info_map_; @@ -151,10 +155,23 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { std::unordered_set visited_primfuncs; }; +Optional PoolAllocationToOffsetConverter::GetResourceHandle(const PrimFunc& func) { + if (func->buffer_map.find(func->params.back()) == func->buffer_map.end()) { + return func->params.back(); + } + return Optional(); +} + PoolAllocationToOffsetConverter::ScopeInfo PoolAllocationToOffsetConverter::UpdateFunctionScopeInfo( const PrimFunc& original_func) { ScopeInfo si; + + Optional resource_handle = GetResourceHandle(original_func); si.params = original_func->params; + if (resource_handle) { + si.params.pop_back(); + ICHECK(si.params.size() == original_func->params.size() - 1); + } si.buffer_map = original_func->buffer_map; Map ret; for (const AllocatedPoolInfo& allocated_pool_info : allocated_pool_ordering_) { @@ -179,6 +196,9 @@ PoolAllocationToOffsetConverter::ScopeInfo PoolAllocationToOffsetConverter::Upda si.buffer_map.Set(pool_var, Buffer(buffer_var, elem_dtype, {pool_size}, {1}, 1, buffer_var_name, 16, 1, BufferType::kDefault)); } + if (resource_handle) { + si.params.push_back(resource_handle.value()); + } return si; } @@ -199,7 +219,7 @@ PrimFunc PoolAllocationToOffsetConverter::CreatePrimFuncWithPoolParams( PrimFunc ret = PrimFunc(si.params, new_body, original_primfunc->ret_type, si.buffer_map, original_attrs); if (!emit_tvmscript_printable_) { - return WithAttr(ret, tvm::attr::kPoolArgs, si.allocated_pool_params); + ret = WithAttr(ret, tvm::attr::kPoolArgs, si.allocated_pool_params); } visited_primfuncs.insert(ret); return ret; @@ -207,9 +227,14 @@ PrimFunc PoolAllocationToOffsetConverter::CreatePrimFuncWithPoolParams( return original_primfunc; } -Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs( - const Array& args) { +Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs(Array args, + const PrimFunc& func) { Array new_args; + PrimExpr resource_handle_arg; + if (args.size() == func->params.size() + 1) { + resource_handle_arg = args.back(); + args.pop_back(); + } for (const auto& arg : args) { new_args.push_back(VisitExpr(arg)); } @@ -219,6 +244,9 @@ Array PoolAllocationToOffsetConverter::AppendPoolParamsToArgs( Buffer buffer_var = top_scope.buffer_map[pool_var]; new_args.push_back(buffer_var->data); } + if (resource_handle_arg.defined()) { + new_args.push_back(resource_handle_arg); + } return new_args; } @@ -240,12 +268,13 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::call_extern()) || op->op.same_as(builtin::tvm_call_cpacked())) { String func_name = Downcast(op->args[0])->value; Array new_args; - if (module_->ContainGlobalVar(func_name)) { + if (module_->ContainGlobalVar(func_name) && + module_->Lookup(func_name)->IsInstance()) { GlobalVar gv = module_->GetGlobalVar(func_name); PrimFunc func = Downcast(module_->Lookup(gv)); PrimFunc prim_func = CreatePrimFuncWithPoolParams(func); module_->Update(gv, prim_func); - new_args = AppendPoolParamsToArgs(op->args); + new_args = AppendPoolParamsToArgs(op->args, prim_func); new_args = ReplaceAllocateArgsWithLetArgs(new_args); } else { new_args = ReplaceAllocateArgsWithLetArgs(op->args); @@ -255,8 +284,7 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const CallNode* op) { if (op->op->IsInstance()) { PrimFunc func = Downcast(op->op); PrimFunc prim_func = CreatePrimFuncWithPoolParams(func); - Array new_args = AppendPoolParamsToArgs(op->args); - 
new_args = AppendPoolParamsToArgs(new_args); + Array new_args = AppendPoolParamsToArgs(op->args, prim_func); new_args = ReplaceAllocateArgsWithLetArgs(new_args); return Call(op->dtype, prim_func, new_args); } @@ -329,8 +357,7 @@ IRModule PoolAllocationToOffsetConverter::operator()() { namespace transform { tvm::transform::Pass ConvertPoolAllocationsToOffsets( - const Map& pool_allocations, - Bool emit_tvmscript_printable = Bool(false)) { + const Map& pool_allocations, Bool emit_tvmscript_printable) { auto pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { return Downcast(PoolAllocationToOffsetConverter( m, pool_allocations, emit_tvmscript_printable->value != 0)()); diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc new file mode 100644 index 000000000000..9ef88f9acca9 --- /dev/null +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/analysis/usmp/unified_static_memory_planner.cc + * \brief This is the pass that integrates the USMP passes to + * a single composite pass. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace tvm { + +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.disable", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); + +namespace tir { +namespace usmp { + +static constexpr const char* kDefaultAlgo = "greedy_by_size"; + +static std::unordered_map( + const Array&, const Integer&)>> + algorithms{{"greedy_by_size", algo::GreedyBySize}, + {"greedy_by_conflicts", algo::GreedyByConflicts}}; + +IRModule PlanMemory(const IRModule& mod, String algo) { + VLOG(1) << "workspace required = " << CalculateModuleWorkspaceSize(mod); + PrimFunc main_func = Downcast(mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod); + Array buffer_info_arr = + CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts); + Map buffer_info_pool_allocations = + algorithms[algo](buffer_info_arr, buffer_info_analysis->memory_pressure); + Map stmt_pool_allocations = AssignStmtPoolAllocations( + buffer_info_analysis->buffer_info_stmts, buffer_info_pool_allocations); + IRModule ret = transform::ConvertPoolAllocationsToOffsets(stmt_pool_allocations)(mod); + tir::PrimFunc tir_main_func = + Downcast(ret->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + Optional> allocated_pool_infos = + tir_main_func->GetAttr>(tvm::attr::kPoolArgs); + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + VLOG(1) << "pool_size = " << allocated_pool_info->allocated_size; + } + } + return ret; +} + +} // namespace usmp + +namespace transform { + +tvm::transform::Pass UnifiedStaticMemoryPlanner() { + auto usmp_main_pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { + auto algorithm_str = ctx->GetConfig("tir.usmp.algorithm", String(usmp::kDefaultAlgo)); + return Downcast( + usmp::PlanMemory(m, algorithm_str.value_or(String(usmp::kDefaultAlgo)))); + }; + + return tvm::transform::Sequential( + {tvm::tir::usmp::transform::AssignPoolInfo(), + tvm::transform::CreateModulePass(usmp_main_pass_func, 0, + "tir.transform.UnifiedStaticMemoryPlanner", {})}); +} + +TVM_REGISTER_GLOBAL("tir.transform.UnifiedStaticMemoryPlanner") + .set_body_typed(UnifiedStaticMemoryPlanner); + +} // namespace transform +} // namespace tir +} // namespace tvm diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 14b3d26641a3..36aa3b6d768a 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -24,7 +24,11 @@ #include #include +#include +#include +#include #include +#include #include namespace tvm { @@ -88,11 +92,13 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << ",\n memory_pressure=" << node->memory_pressure << ")"; }); -PoolInfo::PoolInfo(String pool_name, Map target_access, Integer size_hint_bytes) { +PoolInfo::PoolInfo(String pool_name, Map target_access, Integer size_hint_bytes, + Bool is_internal) { auto poolinfo_node = make_object(); poolinfo_node->pool_name = pool_name; poolinfo_node->size_hint_bytes = size_hint_bytes; poolinfo_node->target_access = target_access; + poolinfo_node->is_internal = is_internal; data_ = std::move(poolinfo_node); } @@ -168,6 +174,20 @@ Array CreateArrayBufferInfo(const Map& buffer_info return ret; } +void PrintConflicts(const Array& bi_arr) { + for (const auto& bi : bi_arr) { + std::stringstream ss; + ss << "buf=" << bi->name_hint; + ss << "|conflicts="; + for (const auto& conflict_bi : bi->conflicts) { + BufferInfo cbi = 
Downcast(conflict_bi); + ss << cbi->name_hint << ","; + } + ss << "\n"; + LOG(INFO) << ss.str(); + } +} + Map AssignStmtPoolAllocations( const Map& buffer_info_to_stmt, const Map& buffer_info_to_pool_allocation) { @@ -195,6 +215,66 @@ Integer CalculateExtentsSize(const AllocateNode* op) { return Integer(num_elements * element_size_bytes); } +class ModuleWorkspaceSizeCalculator : public StmtExprVisitor { + public: + explicit ModuleWorkspaceSizeCalculator(const IRModule& module) : mod_(module) { + for (const auto& gv_func : mod_->functions) { + functions_.Set(gv_func.first->name_hint, Downcast(gv_func.second)); + } + main_func_ = Downcast(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + ICHECK(main_func_.defined()) << "main function is not in the module"; + Optional target_host = main_func_->GetAttr(tvm::attr::kTarget); + ICHECK(target_host) << "main function does not have a target attr"; + target_host_ = target_host.value(); + } + + Integer operator()() { + UpdateWorkspaceData(main_func_); + return Integer(max_workspace_size); + } + + private: + void UpdateWorkspaceData(const PrimFunc& func) { + Target tgt = func->GetAttr(tvm::attr::kTarget).value_or(target_host_); + Integer workspace_byte_alignment = + tgt->GetAttr("workspace-byte-alignment").value_or(16); + Integer workspace_req = CalculateWorkspaceBytes(func, workspace_byte_alignment); + if (workspace_req) { + current_workspace_size_ += workspace_req->value; + } + if (max_workspace_size < current_workspace_size_) { + max_workspace_size = current_workspace_size_; + } + this->VisitStmt(func->body); + if (workspace_req) { + current_workspace_size_ -= workspace_req->value; + } + } + + void VisitExpr_(const CallNode* op) override { + if (op->op.same_as(builtin::call_extern())) { + PrimFunc func = functions_.at(Downcast(op->args[0])->value); + UpdateWorkspaceData(func); + } else if (op->op->IsInstance()) { + PrimFunc func = Downcast(op->op); + UpdateWorkspaceData(func); + } else { + StmtExprVisitor::VisitExpr_(op); + } + } + + IRModule mod_; + Target target_host_; + PrimFunc main_func_; + Map functions_; + size_t current_workspace_size_ = 0; + size_t max_workspace_size = 0; +}; + +Integer CalculateModuleWorkspaceSize(const IRModule& mod) { + return ModuleWorkspaceSizeCalculator(mod)(); +} + TVM_REGISTER_GLOBAL("tir.usmp.CreateArrayBufferInfo") .set_body_typed([](Map buffer_info_map) { return (CreateArrayBufferInfo(buffer_info_map)); diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 96c8433a6384..36786ed8f2fe 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -214,7 +214,9 @@ def create_test_runner(accel="ethos-u55-256"): pass_config={ "relay.ext.ethos-u.options": { "accelerator_config": accel, - } + }, + # TODO(@manupa-arm): enable USMP once TIR to CS translator is capable of handling let bindings + "tir.usmp.disable": True, }, ) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 3f448ca4a7d9..c65888953c33 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -702,9 +702,8 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - workspace_bytes += model.extra_memory_in_bytes - if interface_api == "packed": - workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) + # TODO(@manupa-arm): remove the stack allocator once the microNPU uses target hooks + workspace_bytes = model.extra_memory_in_bytes 
for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) @@ -815,6 +814,7 @@ def compile_and_run( target=target, target_opts=target_opts, ) + run_and_check( models=compiled_test_mods, runner=runner, diff --git a/tests/python/relay/aot/corstone300.ld b/tests/python/relay/aot/corstone300.ld index ddf55b868780..e066b1538481 100644 --- a/tests/python/relay/aot/corstone300.ld +++ b/tests/python/relay/aot/corstone300.ld @@ -249,6 +249,13 @@ SECTIONS . = ALIGN(16); } > SRAM AT > SRAM + .bss.tvm : + { + . = ALIGN(16); + *(.bss.tvm) + . = ALIGN(16); + } > DDR + .bss.NoInit : { . = ALIGN(16); diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 8a2b1f1bb84d..6d2806b8c8e9 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -515,8 +515,8 @@ def test_quant_mobilenet_tfl(): import tvm.relay.testing.tf as tf_testing - interface_api = "packed" - use_unpacked_api = False + use_unpacked_api = True + interface_api = "c" test_runner = AOT_DEFAULT_RUNNER tflite_model_file = tf_testing.get_workload_official( @@ -658,14 +658,14 @@ def test_deprecated_target_arguments(capsys): @pytest.mark.parametrize( - "workspace_byte_alignment,main_workspace_size,sum_workspace_size", + "workspace_byte_alignment,main_workspace_size", [ - (8, 10368, 15200), - (16, 10368, 15232), - (256, 10752, 17408), + (8, 17280), + (16, 17280), + (256, 17792), ], ) -def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_workspace_size): +def test_memory_planning(workspace_byte_alignment, main_workspace_size): mod, params = tvm.relay.testing.synthetic.get_workload() target = "c" runtime = Runtime("crt") @@ -675,22 +675,18 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work "workspace-byte-alignment": workspace_byte_alignment, }, ) - with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "tir.disable_storage_rewrite": True, + "tir.usmp.algorithm": "greedy_by_conflicts", + }, + ): lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) - assert ( sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size ) - assert ( - sum( - [ - size - for metadata in lib.function_metadata.values() - for size in metadata.workspace_sizes.values() - ] - ) - == sum_workspace_size - ) def test_aot_codegen_backend_alloc_workspace_calls(): @@ -726,6 +722,7 @@ def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), fl models=AOTTestModel(module=relay_mod, inputs=None, outputs=None), interface_api="c", use_unpacked_api=True, + pass_config={"tir.usmp.disable": True}, ) source = compiled_test_mods[0].executor_factory.lib.imported_modules[0].get_source() # There should be three allocates created for three primitive relay function From 2e945c28e2d5ea6257e525b90fef62f6dc284dba Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 13 Dec 2021 13:18:14 +0000 Subject: [PATCH 02/15] USMP integration with AoT executor codegen * This commit breaks out the USMP related tests to test_crt_aot_usmp.py. * Switched the polarity of the pass config option to explicitly enable USMP rather than disable. 
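For reference, a minimal sketch of how USMP is switched on from Python under the new polarity (this mirrors the usage in the new test_crt_aot_usmp.py introduced below; the workload, target and algorithm choice here are only illustrative):

    import tvm
    from tvm import relay
    from tvm.relay import testing
    from tvm.relay.backend import Executor, Runtime

    mod, params = testing.synthetic.get_workload()
    executor = Executor("aot", {"workspace-byte-alignment": 16})
    runtime = Runtime("crt")

    # USMP is now opt-in: enable it, and optionally select an algorithm
    # ("greedy_by_size" or "greedy_by_conflicts"), via the PassContext config.
    with tvm.transform.PassContext(
        opt_level=3,
        config={
            "tir.disable_vectorize": True,
            "tir.disable_storage_rewrite": True,
            "tir.usmp.enable": True,
            "tir.usmp.algorithm": "greedy_by_size",
        },
    ):
        lib = relay.build(mod, "c", executor=executor, runtime=runtime, params=params)
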
Change-Id: Id4bf35b18479b70924ec24e6bb7ba2682b05326e --- src/relay/backend/aot_executor_codegen.cc | 4 +- src/tir/usmp/unified_static_memory_planner.cc | 2 +- tests/python/contrib/test_ethosu/infra.py | 2 - tests/python/relay/aot/aot_test_utils.py | 4 +- tests/python/relay/aot/test_crt_aot.py | 33 --- tests/python/relay/aot/test_crt_aot_usmp.py | 266 ++++++++++++++++++ 6 files changed, 272 insertions(+), 39 deletions(-) create mode 100644 tests/python/relay/aot/test_crt_aot_usmp.py diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 38b1cee6b9a2..a6a433a7cb76 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -691,13 +691,13 @@ class AOTExecutorCodegen : public MixedModeVisitor { */ IRModule PlanMemoryLoweredModule(const IRModule& mod) { transform::PassContext pass_ctx = transform::PassContext::Current(); - bool disable_usmp = pass_ctx->GetConfig("tir.usmp.disable", Bool(false)).value(); + bool enable_usmp = pass_ctx->GetConfig("tir.usmp.enable", Bool(false)).value(); IRModule lowered_mod = mod->ShallowCopy(); Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); - if (!disable_usmp) { + if (enable_usmp) { lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); // Update workspace size based on the pool allocations. Optional> allocated_pool_infos = diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 9ef88f9acca9..00abaac1b0dd 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -35,7 +35,7 @@ namespace tvm { -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.disable", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.enable", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); namespace tir { diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 36786ed8f2fe..0b058a94fb60 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -215,8 +215,6 @@ def create_test_runner(accel="ethos-u55-256"): "relay.ext.ethos-u.options": { "accelerator_config": accel, }, - # TODO(@manupa-arm): enable USMP once TIR to CS translator is capable of handling let bindings - "tir.usmp.disable": True, }, ) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index c65888953c33..6900bdc2e6e1 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -702,8 +702,10 @@ def run_and_check( t = tarfile.open(tar_file) t.extractall(base_path) - # TODO(@manupa-arm): remove the stack allocator once the microNPU uses target hooks workspace_bytes = model.extra_memory_in_bytes + use_usmp = runner.pass_config.get("tir.usmp.enable", False) + if interface_api == "packed" and not use_usmp: + workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 6d2806b8c8e9..566566da1dce 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -657,38 +657,6 @@ def test_deprecated_target_arguments(capsys): ) -@pytest.mark.parametrize( - 
"workspace_byte_alignment,main_workspace_size", - [ - (8, 17280), - (16, 17280), - (256, 17792), - ], -) -def test_memory_planning(workspace_byte_alignment, main_workspace_size): - mod, params = tvm.relay.testing.synthetic.get_workload() - target = "c" - runtime = Runtime("crt") - executor = Executor( - "aot", - { - "workspace-byte-alignment": workspace_byte_alignment, - }, - ) - with tvm.transform.PassContext( - opt_level=3, - config={ - "tir.disable_vectorize": True, - "tir.disable_storage_rewrite": True, - "tir.usmp.algorithm": "greedy_by_conflicts", - }, - ): - lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) - assert ( - sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size - ) - - def test_aot_codegen_backend_alloc_workspace_calls(): """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls""" @@ -722,7 +690,6 @@ def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), fl models=AOTTestModel(module=relay_mod, inputs=None, outputs=None), interface_api="c", use_unpacked_api=True, - pass_config={"tir.usmp.disable": True}, ) source = compiled_test_mods[0].executor_factory.lib.imported_modules[0].get_source() # There should be three allocates created for three primitive relay function diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py new file mode 100644 index 000000000000..b88e0905dba5 --- /dev/null +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -0,0 +1,266 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" This file contains test that use USMP + AoT using C runtime APIs""" + +from collections import OrderedDict +import sys + +import numpy as np +import pytest + +import tvm +from tvm import relay, TVMError +from tvm.ir.module import IRModule +from tvm.relay import testing, transform +from tvm.relay.testing import byoc +from tvm.relay.op.annotation import compiler_begin, compiler_end +from tvm.relay.backend import Executor, Runtime +from aot_test_utils import ( + AOTTestModel, + AOTTestRunner, + generate_ref_data, + convert_to_relay, + compile_and_run, + compile_models, + parametrize_aot_options, + run_and_check, +) + + +def check_for_no_tvm_backendallocworkspace_calls(mod: tvm.runtime.module): + """This checker checks whether any c-source produced has TVMBackendAllocWorkspace calls. 
+ If USMP is invoked, none of them should have TVMBAW calls""" + dso_modules = mod._collect_dso_modules() + for dso_mod in dso_modules: + assert ( + dso_mod.type_key == "c" + ), 'Current CRT AoT codegen flow should only produce type "c" runtime modules' + source = dso_mod.get_source() + source.count( + "TVMBackendAllocWorkspace" + ) == 0, "This is failing because USMP was unable to plan for every tir.allocate node" + + +@pytest.mark.parametrize( + "workspace_byte_alignment,main_workspace_size", + [ + (8, 17280), + (16, 17280), + (256, 17792), + ], +) +def test_memory_planning(workspace_byte_alignment, main_workspace_size): + mod, params = tvm.relay.testing.synthetic.get_workload() + target = "c" + runtime = Runtime("crt") + executor = Executor( + "aot", + { + "workspace-byte-alignment": workspace_byte_alignment, + }, + ) + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "tir.disable_storage_rewrite": True, + "tir.usmp.enable": True, + "tir.usmp.algorithm": "greedy_by_conflicts", + }, + ): + lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) + assert ( + sum(lib.function_metadata["__tvm_main__"].workspace_sizes.values()) == main_workspace_size + ) + + +@parametrize_aot_options +@pytest.mark.parametrize("groups,weight_shape", [(1, 32), (32, 1)]) +def test_conv2d(interface_api, use_unpacked_api, test_runner, groups, weight_shape): + """Test a subgraph with a single conv2d operator.""" + dtype = "float32" + ishape = (1, 32, 14, 14) + wshape = (32, weight_shape, 3, 3) + pass_config = {"tir.usmp.enable": True} + test_runner = AOTTestRunner( + makefile=test_runner.makefile, + prologue=test_runner.prologue, + epilogue=test_runner.epilogue, + includes=test_runner.includes, + parameters=test_runner.parameters, + pass_config=pass_config, + ) + + data0 = relay.var("data", shape=ishape, dtype=dtype) + weight0 = relay.var("weight", shape=wshape, dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=groups) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, wshape).astype(dtype) + + inputs = OrderedDict([("data", i_data), ("weight", w1_data)]) + + output_list = generate_ref_data(mod, inputs) + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + test_runner, + interface_api, + use_unpacked_api, + ) + compiled_test_mods = compile_models( + models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) + + +@pytest.mark.parametrize("merge_compiler_regions", [False, True]) +def test_byoc_microtvm(merge_compiler_regions): + """This is a simple test to check BYOC capabilities of AOT - with and without merging compiler regions to test for https://github.com/apache/tvm/issues/9036""" + use_unpacked_api = False + interface_api = "packed" + test_runner = AOTTestRunner(pass_config={"tir.usmp.enable": True}) + + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + + # z0 = x + w0 + x_ = 
compiler_begin(x, "ccompiler") + w0_ = compiler_begin(w0, "ccompiler") + z0_ = relay.add(x_, w0_) + z0 = compiler_end(z0_, "ccompiler") + + # z1 = z0 + w1 + z0__ = compiler_begin(z0, "ccompiler") + w1_ = compiler_begin(w1, "ccompiler") + z1_ = relay.add(z0__, w1_) + z1 = compiler_end(z1_, "ccompiler") + + # z2 = z0 + z1 + z2 = relay.add(z0, z1) + + f = relay.Function([x, w0, w1], z2) + mod = tvm.IRModule() + mod["main"] = f + + if merge_compiler_regions: + mod = transform.MergeCompilerRegions()(mod) + + mod = transform.PartitionGraph("mod_name")(mod) + mod = transform.InferType()(mod) + + x_data = [("x", np.random.rand(10, 10).astype("float32"))] + w_data = [("w{}".format(i), np.random.rand(10, 10).astype("float32")) for i in range(2)] + + map_inputs = OrderedDict(x_data + w_data) + output_list = generate_ref_data(mod, map_inputs) + + compiled_test_mods = compile_models( + AOTTestModel(name="my_mod", module=mod, inputs=map_inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) + + +MOBILENET_V1_URL = ( + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", +) + + +@pytest.mark.parametrize( + "model_url, usmp_algo, workspace_size,", + [ + (MOBILENET_V1_URL, "greedy_by_size", 4845696), + (MOBILENET_V1_URL, "greedy_by_conflicts", 4845696), + ], +) +def test_tflite_model(model_url, usmp_algo, workspace_size): + """This checks for ML models and the memory used by them when using USMP with different algorithms""" + pytest.importorskip("tflite") + + import tvm.relay.testing.tf as tf_testing + + use_unpacked_api = True + interface_api = "c" + test_runner = AOTTestRunner( + pass_config={"tir.usmp.enable": True, "tir.usmp.algorithm": usmp_algo} + ) + + tflite_model_file = tf_testing.get_workload_official( + model_url[0], + model_url[1], + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data_shape = (1, 224, 224, 3) + in_min, in_max = (0, 255) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") + mod, params = convert_to_relay(tflite_model_buf, data, "input") + inputs = {"input": data} + output_list = generate_ref_data(mod, inputs, params) + + compiled_test_mods = compile_models( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + for compiled_model in compiled_test_mods: + check_for_no_tvm_backendallocworkspace_calls(compiled_model.executor_factory.lib) + + # Checking the workspace size + assert ( + sum( + compiled_model.executor_factory.function_metadata[ + "__tvm_main__" + ].workspace_sizes.values() + ) + == workspace_size + ) + + run_and_check( + models=compiled_test_mods, + runner=test_runner, + interface_api=interface_api, + ) From 13afe19d30aabc881e638dc293707412b689b041 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 13 Dec 2021 17:00:23 +0000 Subject: [PATCH 03/15] USMP integration with AoT executor codegen * Swap pool var and output var ordering in the main function Change-Id: Id03748e6f3528399a0ddd9cc2d011adfcee8d554 --- 
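In other words, the generated main function now takes its arguments in the order: input handles, then pool handles, then output handles; previously the pool handles came after the outputs. A sketch of the signature change, taken from the updated expectations in the test below (buffer names and counts are specific to that test case):

    # before (inputs, outputs, pool vars):
    def run_model(input: T.handle, output: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle) -> None: ...

    # after (inputs, pool vars, outputs):
    def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: ...
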
..._tir_usmp_transform_convert_pool_allocations_to_offsets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index fc615775c160..ab40c646391c 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -140,7 +140,7 @@ def run_model(input: T.handle, output: T.handle) -> None: @tvm.script.ir_module class LinearStructurePlanned: @T.prim_func - def run_model(input: T.handle, output: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle) -> None: + def run_model(input: T.handle, fast_memory_0_var: T.handle, slow_memory_1_var: T.handle, output: T.handle) -> None: fast_memory_0_buffer_var = T.match_buffer(fast_memory_0_var, [200704], dtype="uint8", strides=[1], elem_offset=1, align=16) slow_memory_1_buffer_var = T.match_buffer(slow_memory_1_var, [1418528], dtype="uint8", strides=[1], elem_offset=1, align=16) # body @@ -464,7 +464,7 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T.store(T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2, T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_1_let, ax3_inner_2) + T.load("int32", placeholder_15.data, ax3_inner_2), 1608879842, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True) @T.prim_func - def run_model(input: T.handle, output: T.handle, global_workspace_0_var: T.handle) -> None: + def run_model(input: T.handle, global_workspace_0_var: T.handle, output: T.handle) -> None: global_workspace_0_buffer_var = T.match_buffer(global_workspace_0_var, [7920256], dtype="uint8", strides=[1], elem_offset=1, align=16) # body T.attr("default", "device_id", 0) From 6aa927c3a048ea4e47ee4c895fe863adc49e1ca5 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 20 Dec 2021 12:22:00 +0000 Subject: [PATCH 04/15] USMP integration with AoT executor codegen * Improving comments * Changing the usage of tir Bool to just plain bool Change-Id: I907cf9a0befa172183ae488bb3b94660aad39807 --- include/tvm/tir/transform.h | 3 ++- include/tvm/tir/usmp/transform.h | 12 +++++------- include/tvm/tir/usmp/utils.h | 10 ++++++---- src/tir/usmp/unified_static_memory_planner.cc | 2 ++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 97d750ce6aad..3a964eb77d1b 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -486,7 +486,8 @@ TVM_DLL Pass ConvertForLoopsToSerial(); /*! * \brief This is the unified static memory planner pass that will - * plan for memory intra- and inter- PrimFuncs together. + * plan for memory intra- and inter- PrimFuncs together. The pass + * requires all the function to be PrimFuncs including the main. * \return The pass. */ TVM_DLL Pass UnifiedStaticMemoryPlanner(); diff --git a/include/tvm/tir/usmp/transform.h b/include/tvm/tir/usmp/transform.h index 7bc08ea3933c..6de64704bd8b 100644 --- a/include/tvm/tir/usmp/transform.h +++ b/include/tvm/tir/usmp/transform.h @@ -37,9 +37,9 @@ using Pass = tvm::transform::Pass; /*! 
* \brief Convert the analyzed PoolAllocation to offsets from pool variables * - * This pass would convert the IRModule that contains all PrimFuncs that contains - * the associated PoolAllocations to be read from being offset from the input var - * of the PrimFunc. + * This pass would convert the main function to accept pool variables as an input + * that get passed onto the operator PrimFuncs. Furthermore, the static allocations + * will be converted to offsets within the pool variable. * * \return the pass */ @@ -49,10 +49,8 @@ TVM_DLL Pass ConvertPoolAllocationsToOffsets(const Map target_access; // 'rw' or 'ro' - /*! \brief Whether pool is internally generated*/ - Bool is_internal = Bool(false); + /*! \brief Whether pool is internally generated. + * The internal pools will be generated as part of + * the entry point code generation of the executor*/ + bool is_internal = false; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pool_name", &pool_name); @@ -320,8 +322,8 @@ namespace attr { static constexpr const char* kPoolArgs = "pool_args"; /*! - * \brief This is a BaseFunc attribute to indicate which input var represent - * a PoolInfo Object in the form of a Map. + * \brief This is a IRModule attribute that contains all the PoolInfo objects + * as an Array. */ static constexpr const char* kPoolInfoIRModuleAttr = "pool_infos"; diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 00abaac1b0dd..91e0218e34c8 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -54,6 +54,8 @@ IRModule PlanMemory(const IRModule& mod, String algo) { BufferInfoAnalysis buffer_info_analysis = ExtractBufferInfo(main_func, mod); Array buffer_info_arr = CreateArrayBufferInfo(buffer_info_analysis->buffer_info_stmts); + CHECK(algorithms.count(algo)) << "The selected USMP algorithm : " << algo + << "is not defined. 
Please define it in the above algorithms map."; Map buffer_info_pool_allocations = algorithms[algo](buffer_info_arr, buffer_info_analysis->memory_pressure); Map stmt_pool_allocations = AssignStmtPoolAllocations( From 6e1d305c8be4d67611e1ee4353d3fe815785d8e8 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 20 Dec 2021 18:29:40 +0000 Subject: [PATCH 05/15] USMP integration with AoT executor codegen * fixing unit tests to scope With * removing global sanitization of var names Change-Id: Ifa8934f94744eeaac13e4b4ddcd671842c3dcb21 --- src/relay/backend/aot_executor_codegen.cc | 3 ++- src/target/source/codegen_source_base.cc | 6 +++--- src/target/source/codegen_source_base.h | 2 +- tests/cpp/build_module_test.cc | 16 ++++++++++------ 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a6a433a7cb76..3b9ffa75c275 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -42,6 +42,7 @@ #include #include +#include "../../target/source/codegen_source_base.h" #include "../op/annotation/annotation.h" #include "../op/call/call.h" #include "../op/memory/device_copy.h" @@ -849,7 +850,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - std::string input_name = input->name_hint(); + std::string input_name = codegen::CodeGenSourceBase::SanitiseName(input->name_hint()); CreateIOVar(input, input_name); } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 518da725d08f..e0becee2dd64 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -74,13 +74,13 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { std::string key = v->name_hint; std::string vid = GetUniqueName(key); var_idmap_[v] = vid; - return SanitiseName(vid); + return vid; } std::string CodeGenSourceBase::GetVarID(const tir::VarNode* v) const { auto it = var_idmap_.find(v); ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; - return SanitiseName(it->second); + return it->second; } void CodeGenSourceBase::PrintIndent() { @@ -113,7 +113,7 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -std::string CodeGenSourceBase::SanitiseName(std::string name) const { +std::string CodeGenSourceBase::SanitiseName(std::string name) { std::replace_if( name.begin(), name.end(), [](char c) { diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index 0a3066db51a6..ff94fa68dae7 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -57,7 +57,7 @@ class CodeGenSourceBase { * \brief Sanitize names by removing illegal characters * \param name The name to be sanitised. */ - std::string SanitiseName(std::string name) const; + static std::string SanitiseName(std::string name); protected: /*! 
\brief entry in ssa assign map */ diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index d5a4c91a3c43..ff3641cd6982 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -107,18 +107,22 @@ TEST(BuildModule, Heterogeneous) { auto elemwise_sub = compute( C->shape, [©, &C](PrimExpr i) { return copy[i] - C[i]; }, "elemwise_sub"); - With cuda_scope(target_cuda); - auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add}); + auto fcreate_s1 = [=]() { + With cuda_scope(target_cuda); + return topi::cuda::schedule_injective(target_cuda, {elemwise_add}); + }; - With llvm_scope(target_llvm); - auto s2 = create_schedule({elemwise_sub->op}); + auto fcreate_s2 = [=]() { + With llvm_scope(target_llvm); + return create_schedule({elemwise_sub->op}); + }; auto args1 = Array({A, B, elemwise_add}); auto args2 = Array({copy, C, elemwise_sub}); std::unordered_map binds; - auto lowered_s1 = LowerSchedule(s1, args1, "elemwise_add", binds); - auto lowered_s2 = LowerSchedule(s2, args2, "elemwise_sub", binds); + auto lowered_s1 = LowerSchedule(fcreate_s1(), args1, "elemwise_add", binds); + auto lowered_s2 = LowerSchedule(fcreate_s2(), args2, "elemwise_sub", binds); Map inputs = {{target_cuda, lowered_s1}, {target_llvm, lowered_s2}}; auto module = build(inputs, Target()); From 82ba5fb555c324f4743dc6fd0d0a9753c843041e Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 10:18:43 +0000 Subject: [PATCH 06/15] USMP integration with AoT executor codegen * changing unpacked_api to accept PrimFunc without buffer_maps. Change-Id: I97e8df5272df7f1f8313f184aa660924989940e3 --- src/tir/transforms/make_unpacked_api.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index d118547517fe..bc9870b9ac84 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -60,15 +60,17 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { // Collect variables and buffers to map between Array args; - // We only iterate the function params upto number of buffer_map keys - // because if there exist a resource handle, it will not have a buffer - for (unsigned int i = 0; i < func->buffer_map.size(); i++) { - Var param = func->params[i]; - args.push_back(func->buffer_map[param]->data); - } - if (func->params.size() == func->buffer_map.size() + 1) { - args.push_back(func->params.back()); + for (const Var& param : func->params) { + // Ideally all func params should have Buffers defined in the buffer_map + // We should look to insert buffer_maps for all PrimFuncs that are returned + // to the core compiler. + if (func->buffer_map.find(param) != func->buffer_map.end()) { + args.push_back(func->buffer_map[param]->data); + } else { + args.push_back(param); + } } + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); From ee9f83939b848d5539e88a276c151a828832e437 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 18:13:37 +0000 Subject: [PATCH 07/15] USMP integration with AoT executor codegen * Restoring the conditional behaviour as we are getting PrimFuncs without buffer maps. 
Change-Id: Icd374353e0947515c523d506c1e7c02cf5d930b4 --- src/tir/transforms/make_unpacked_api.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index bc9870b9ac84..6365e09246fc 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -71,8 +71,10 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { } } - device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); - device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + if (func->buffer_map.size()) { + device_init.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + device_init.push_back(AttrStmt(node, attr::device_type, device_type, nop)); + } func_ptr->body = MergeNest(device_init, func_ptr->body); func_ptr->params = args; From 3d7aaa40a76d62516f0ba561858b0dc281238144 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 21 Dec 2021 18:34:27 +0000 Subject: [PATCH 08/15] USMP integration with AoT executor codegen * Adds tir includes to source_module.cc Change-Id: I8ad5654ea32abb3aabc0fcc9ef8157a609e9d2dc --- src/target/source/source_module.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 9ecbf173cbbc..50cba6b0b996 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include From e23fe524d49f9b69a637496c2be7ef200b0a82c1 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 22 Dec 2021 08:32:37 +0000 Subject: [PATCH 09/15] USMP integration with AoT executor codegen * fixing c device api tests Change-Id: I47ac38b5664a00f36b293a0d9d96330d4206173d --- tests/python/relay/aot/test_c_device_api.py | 64 +++++++++------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index 473b8d5ee300..c4ca733422e6 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -92,7 +92,7 @@ def compile_to_main_func(interface_api="c", use_unpacked_api=True): workspace_byte_alignment=16, pass_config=test_runner.pass_config, ) - main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[1][1] + main_ir_module = compiled_models[0].executor_factory.lowered_ir_mods.items()[0][1] main_func = main_ir_module["run_model"] return main_func @@ -136,44 +136,29 @@ def test_device_api_hooks_unpacked_api(device_api_main_func): # Activate Device assert ( - str(main_func.body[0][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUActivate",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[0]) + == "tir.call_extern(" + '"TVMDeviceEthosUActivate",' + " device_context_ethos_u)\n" ) # Open Device + print(str(main_func.body[1])) assert ( - str(main_func.body[1].body.body[0][0][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUOpen",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][0]) + == "tir.call_extern(" + '"TVMDeviceEthosUOpen",' + " device_context_ethos_u)\n" ) # Device Call assert ( - str(main_func.body[1].body.body[0][0][1].value) - == "@tir.call_extern(" - + '"tvmgen_default_ethos_u_main_0",' - + " input: handle, output: handle," - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][1]) 
+ == 'tir.call_extern("tvmgen_default_ethos_u_main_0", x_int8_buffer_var, output_buffer_var, device_context_ethos_u)\n' ) # Close Device assert ( - str(main_func.body[1].body.body[0][0][2].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUClose",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(main_func.body[1][0][0][2]) + == "tir.call_extern(" + '"TVMDeviceEthosUClose",' + " device_context_ethos_u)\n" ) # Deactivate Device assert ( - str(main_func.body[2][0].value) - == "@tir.call_extern(" - + '"TVMDeviceEthosUDeactivate",' - + " device_context_ethos_u: handle," - + " dtype=int32)" + str(str(main_func.body[2])) + == "tir.call_extern(" + '"TVMDeviceEthosUDeactivate",' + " device_context_ethos_u)\n" ) @@ -231,13 +216,10 @@ def test_without_device_api_unpacked_api(non_device_api_main_func): """Test a graph without the Device API with the unpacked internal calls""" main_func = non_device_api_main_func(interface_api="c", use_unpacked_api=True) - + print(str(main_func.body)) assert ( - str(main_func.body[1].body.body[0][0].value) - == "@tir.call_extern(" - + '"tvmgen_default_fused_multiply",' - + " input: handle, input_1: handle, output: handle," - + " dtype=int32)" + str(main_func.body) + == 'tir.call_extern("tvmgen_default_fused_multiply", x_buffer_var, y_buffer_var, output_buffer_var)\n' ) @@ -245,12 +227,18 @@ def test_without_device_api_packed_api(non_device_api_main_func): """Test a graph without the Device API with the packed internal calls""" main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False) - + print(str(main_func.body)) assert ( - str(main_func.body[1].body.body[0][0]) - == 'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n' - + "tir.tvm_struct_set(tvm_value_0, 0, 1, tir.reinterpret((uint64)0))\n" - + 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", input, input, output, tvm_value_0)\n' + str(main_func.body) + == 'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_2 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_1 = tir.tvm_stack_alloca("array", 1)\n' + + 'let tvm_value_0 = tir.tvm_stack_alloca("array", 1)\n' + + "tir.tvm_struct_set(tvm_value_0, 0, 1, x_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_1, 0, 1, y_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_2, 0, 1, output_buffer_var)\n" + + "tir.tvm_struct_set(tvm_value_3, 0, 1, tir.reinterpret((uint64)0))\n" + + 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", tvm_value_0, tvm_value_1, tvm_value_2, tvm_value_3)\n' ) From 439d1efef5d486ce62ced2e5a37a3970d9c8d2d5 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 12 Jan 2022 16:40:33 +0000 Subject: [PATCH 10/15] USMP integration with AoT executor codegen * moving algo/algo.h to algorithms.h * creating two functions for USMP and StorageRewrite * expanding codegen variable names to more descriptive * removing unncessary print functions and print statements * re-using relay::backend::SanitizeName Change-Id: I890caa0104c07a4883eb0c34d3bfcfb8bb56653f --- .../tir/usmp/{algo/algo.h => algorithms.h} | 8 +- include/tvm/tir/usmp/utils.h | 10 ++ src/relay/backend/aot_executor_codegen.cc | 120 ++++++++++-------- src/target/source/codegen_source_base.cc | 10 -- src/target/source/codegen_source_base.h | 6 - src/target/source/source_module.cc | 17 +-- src/tir/usmp/algo/greedy.cc | 1 + src/tir/usmp/transform/assign_pool_info.cc | 2 +- src/tir/usmp/unified_static_memory_planner.cc | 8 +- src/tir/usmp/utils.cc | 14 -- tests/python/relay/aot/test_c_device_api.py | 3 - 11 files changed, 95 
insertions(+), 104 deletions(-) rename include/tvm/tir/usmp/{algo/algo.h => algorithms.h} (93%) diff --git a/include/tvm/tir/usmp/algo/algo.h b/include/tvm/tir/usmp/algorithms.h similarity index 93% rename from include/tvm/tir/usmp/algo/algo.h rename to include/tvm/tir/usmp/algorithms.h index 8a2e3475ce19..77276a2c931c 100644 --- a/include/tvm/tir/usmp/algo/algo.h +++ b/include/tvm/tir/usmp/algorithms.h @@ -18,12 +18,12 @@ */ /*! - * \file tir/usmp/algo/algo.h + * \file tir/usmp/algorithms.h * \brief The memory planning algorithm for USMP */ -#ifndef TVM_TIR_USMP_ALGO_ALGO_H_ -#define TVM_TIR_USMP_ALGO_ALGO_H_ +#ifndef TVM_TIR_USMP_ALGORITHMS_H_ +#define TVM_TIR_USMP_ALGORITHMS_H_ #include @@ -59,4 +59,4 @@ Map GreedyByConflicts(const Array& buffe } // namespace tir } // namespace tvm -#endif // TVM_TIR_USMP_ALGO_ALGO_H_ +#endif // TVM_TIR_USMP_ALGORITHMS_H_ diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index 28194143e7e8..ec909a296fd8 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -31,6 +31,16 @@ #include namespace tvm { + +/*! + * \brief PassContext option to enable the USMP + */ +constexpr const char* kUSMPEnableOption = "tir.usmp.enable"; +/*! + * \brief PassContext option to select the memory planning algorithm in USMP + */ +constexpr const char* kUSMPAlgorithmOption = "tir.usmp.algorithm"; + namespace tir { namespace usmp { diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 3b9ffa75c275..dd66758d8c71 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -687,62 +687,73 @@ class AOTExecutorCodegen : public MixedModeVisitor { } /*! - * brief This function is a wrapper to run memory planning - * followed by recording the latest workspaces required. + * brief Run USMP to plan memory for lowered IRModule */ - IRModule PlanMemoryLoweredModule(const IRModule& mod) { - transform::PassContext pass_ctx = transform::PassContext::Current(); - bool enable_usmp = pass_ctx->GetConfig("tir.usmp.enable", Bool(false)).value(); - - IRModule lowered_mod = mod->ShallowCopy(); + IRModule PlanMemoryWithUSMP(const IRModule& mod) { Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); Integer workspace_byte_alignment = executor_config->GetAttr("workspace-byte-alignment").value_or(16); - if (enable_usmp) { - lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); - // Update workspace size based on the pool allocations. - Optional> allocated_pool_infos = - lowered_mod->GetAttr>(tvm::attr::kPoolArgs); - int main_workspace_size = 0; - if (allocated_pool_infos) { - for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : - allocated_pool_infos.value()) { - main_workspace_size += allocated_pool_info->allocated_size->value; - } + IRModule lowered_mod = mod->ShallowCopy(); + lowered_mod = tir::transform::UnifiedStaticMemoryPlanner()(lowered_mod); + // Update workspace size based on the pool allocations. 
+ for (const auto& kv : function_metadata_) { + if (lowered_mod->ContainGlobalVar(kv.first) && + lowered_mod->Lookup(kv.first)->IsInstance()) { + tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); + Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); + const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); + kv.second->workspace_sizes.Set(tgt, ws); } - for (const auto& kv : function_metadata_) { - if (lowered_mod->ContainGlobalVar(kv.first) && - lowered_mod->Lookup(kv.first)->IsInstance()) { - tir::PrimFunc pfunc = Downcast(lowered_mod->Lookup(kv.first)); - Target tgt = pfunc->GetAttr(tvm::attr::kTarget).value(); - const auto& ws = CalculateWorkspaceBytes(pfunc, workspace_byte_alignment); - kv.second->workspace_sizes.Set(tgt, ws); + } + Optional> allocated_pool_infos = + lowered_mod->GetAttr>(tvm::attr::kPoolArgs); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.clear(); + if (allocated_pool_infos) { + for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { + for (const auto& kv : allocated_pool_info->pool_info->target_access) { + Target tgt = kv.first; + if (main_func_info->workspace_sizes.find(tgt) == main_func_info->workspace_sizes.end()) { + main_func_info->workspace_sizes.Set(tgt, allocated_pool_info->allocated_size); + } else { + main_func_info->workspace_sizes.Set(tgt, + main_func_info->workspace_sizes[tgt]->value + + allocated_pool_info->allocated_size->value); + } } } - backend::FunctionInfo main_func_info = - lowered_mod->GetAttr("main_func_info").value(); - main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); - } else { - // Running StorageRewrite just on the main function - tir::PrimFunc tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - IRModule main_func_mod; - main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), - tir_main_func); - main_func_mod = tir::transform::StorageRewrite()(main_func_mod); - lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), - main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - tir_main_func = - Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); - // Use the PrimFunc to calculate the workspace required to service the allocates - Integer main_workspace_size = - CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); - backend::FunctionInfo main_func_info = - lowered_mod->GetAttr("main_func_info").value(); - main_func_info->workspace_sizes.Set(target_host_, main_workspace_size); - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); } + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); + return lowered_mod; + } + + /*! 
+ * brief Run StorageRewrite to plan memory for lowered IRModule + */ + IRModule PlanMemoryWithStorageRewrite(const IRModule& mod) { + Executor executor_config = mod->GetAttr(tvm::attr::kExecutor).value(); + Integer workspace_byte_alignment = + executor_config->GetAttr("workspace-byte-alignment").value_or(16); + IRModule lowered_mod = mod->ShallowCopy(); + // Running StorageRewrite just on the main function + tir::PrimFunc tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + IRModule main_func_mod; + main_func_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + tir_main_func); + main_func_mod = tir::transform::StorageRewrite()(main_func_mod); + lowered_mod->Update(lowered_mod->GetGlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), + main_func_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + tir_main_func = + Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); + // Use the PrimFunc to calculate the workspace required to service the allocates + Integer main_workspace_size_bytes = + CalculateWorkspaceBytes(tir_main_func, workspace_byte_alignment); + backend::FunctionInfo main_func_info = + lowered_mod->GetAttr("main_func_info").value(); + main_func_info->workspace_sizes.Set(target_host_, main_workspace_size_bytes); + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info); return lowered_mod; } @@ -850,7 +861,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { for (auto input : lowered_main_func->params) { input_vars_.push_back(input); - std::string input_name = codegen::CodeGenSourceBase::SanitiseName(input->name_hint()); + std::string input_name = SanitizeName(input->name_hint()); CreateIOVar(input, input_name); } @@ -895,7 +906,14 @@ class AOTExecutorCodegen : public MixedModeVisitor { lowered_mod->Update(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); // Parallel for loops are not supported in AoT codegen. lowered_mod = tir::transform::ConvertForLoopsToSerial()(lowered_mod); - lowered_mod = PlanMemoryLoweredModule(lowered_mod); + + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool enable_usmp = pass_ctx->GetConfig(kUSMPEnableOption, Bool(false)).value(); + if (enable_usmp) { + lowered_mod = PlanMemoryWithUSMP(lowered_mod); + } else { + lowered_mod = PlanMemoryWithStorageRewrite(lowered_mod); + } ret.function_metadata = std::move(function_metadata_); // Legalize AOT if needed. 
This means that all the packed calls @@ -928,12 +946,10 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(lowered_mod->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)); Optional> allocated_pool_infos = tir_main_func->GetAttr>(tvm::attr::kPoolArgs); - int main_workspace_size = 0; if (allocated_pool_infos) { for (const tir::usmp::AllocatedPoolInfo& allocated_pool_info : allocated_pool_infos.value()) { pool_vars.push_back(allocated_pool_info->pool_var.value()); pool_var_info.Set(allocated_pool_info->pool_var.value(), allocated_pool_info); - main_workspace_size += allocated_pool_info->allocated_size->value; } } Array devices = ListDevices(); diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index e0becee2dd64..9f0cf9a70b61 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,15 +113,5 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -std::string CodeGenSourceBase::SanitiseName(std::string name) { - std::replace_if( - name.begin(), name.end(), - [](char c) { - { return !std::isalnum(c); } - }, - '_'); - return name; -} - } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index ff94fa68dae7..d938469b8969 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,12 +53,6 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); - /*! - * \brief Sanitize names by removing illegal characters - * \param name The name to be sanitised. - */ - static std::string SanitiseName(std::string name); - protected: /*! \brief entry in ssa assign map */ struct SSAEntry { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 50cba6b0b996..deec2c343ae2 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -26,19 +26,16 @@ #include #include #include -#include -#include -#include #include #include #include +#include "../../relay/backend/name_transforms.h" #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" #include "codegen_c.h" -#include "codegen_source_base.h" namespace tvm { namespace codegen { @@ -203,13 +200,13 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } String GenerateDLTensorStructWrapper(String reference_arg) { - code_ << "DLTensor " << reference_arg << "_dlt = {\n"; + code_ << "DLTensor " << reference_arg << "_dltensor = {\n"; code_ << ".data = &" << reference_arg << "\n"; code_ << "};\n"; - code_ << "TVMValue " << reference_arg << "_tvmv = {\n"; - code_ << ".v_handle = &" << reference_arg << "_dlt\n"; + code_ << "TVMValue " << reference_arg << "_tvm_value = {\n"; + code_ << ".v_handle = &" << reference_arg << "_dltensor\n"; code_ << "};\n"; - return reference_arg + "_tvmv"; + return reference_arg + "_tvm_value"; } void GenerateInternalWorkspaceBuffers() { @@ -374,7 +371,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } else { codegen_c_.PrintType(input_var.dtype(), call_args_ss); } - call_args_ss << " " << codegen_c_.SanitiseName(input_var->name_hint) << ","; + call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ","; } for (unsigned int i = 0; i < metadata_->num_outputs; ++i) { call_args_ss << "void* output" << i << ","; @@ -411,7 +408,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { { 
std::stringstream call_args_ss; for (const auto& input : metadata_->inputs) { - call_args_ss << "inputs->" << codegen_c_.SanitiseName(input->name_hint) << ","; + call_args_ss << "inputs->" << relay::backend::SanitizeName(input->name_hint) << ","; } if (metadata_->num_outputs == 1) { call_args_ss << "outputs->output,"; diff --git a/src/tir/usmp/algo/greedy.cc b/src/tir/usmp/algo/greedy.cc index a434d206162f..324474c569d4 100644 --- a/src/tir/usmp/algo/greedy.cc +++ b/src/tir/usmp/algo/greedy.cc @@ -40,6 +40,7 @@ #include #include #include +#include #include namespace tvm { diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc index ef6559b27caa..516ddd1a241b 100644 --- a/src/tir/usmp/transform/assign_pool_info.cc +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/tir/usmp/unified_static_memory_planner.cc b/src/tir/usmp/unified_static_memory_planner.cc index 91e0218e34c8..5a2125077566 100644 --- a/src/tir/usmp/unified_static_memory_planner.cc +++ b/src/tir/usmp/unified_static_memory_planner.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,8 +35,8 @@ namespace tvm { -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.enable", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("tir.usmp.algorithm", String); +TVM_REGISTER_PASS_CONFIG_OPTION(kUSMPEnableOption, Bool); +TVM_REGISTER_PASS_CONFIG_OPTION(kUSMPAlgorithmOption, String); namespace tir { namespace usmp { @@ -79,7 +79,7 @@ namespace transform { tvm::transform::Pass UnifiedStaticMemoryPlanner() { auto usmp_main_pass_func = [=](IRModule m, tvm::transform::PassContext ctx) { - auto algorithm_str = ctx->GetConfig("tir.usmp.algorithm", String(usmp::kDefaultAlgo)); + auto algorithm_str = ctx->GetConfig(kUSMPAlgorithmOption, String(usmp::kDefaultAlgo)); return Downcast( usmp::PlanMemory(m, algorithm_str.value_or(String(usmp::kDefaultAlgo)))); }; diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 36aa3b6d768a..1fff70f5892e 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -174,20 +174,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info return ret; } -void PrintConflicts(const Array& bi_arr) { - for (const auto& bi : bi_arr) { - std::stringstream ss; - ss << "buf=" << bi->name_hint; - ss << "|conflicts="; - for (const auto& conflict_bi : bi->conflicts) { - BufferInfo cbi = Downcast(conflict_bi); - ss << cbi->name_hint << ","; - } - ss << "\n"; - LOG(INFO) << ss.str(); - } -} - Map AssignStmtPoolAllocations( const Map& buffer_info_to_stmt, const Map& buffer_info_to_pool_allocation) { diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index c4ca733422e6..d369fd0a4a30 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -140,7 +140,6 @@ def test_device_api_hooks_unpacked_api(device_api_main_func): == "tir.call_extern(" + '"TVMDeviceEthosUActivate",' + " device_context_ethos_u)\n" ) # Open Device - print(str(main_func.body[1])) assert ( str(main_func.body[1][0][0][0]) == "tir.call_extern(" + '"TVMDeviceEthosUOpen",' + " device_context_ethos_u)\n" @@ -216,7 +215,6 @@ def test_without_device_api_unpacked_api(non_device_api_main_func): """Test a graph without the Device API with the unpacked internal calls""" main_func = non_device_api_main_func(interface_api="c", use_unpacked_api=True) - print(str(main_func.body)) assert ( 
str(main_func.body) == 'tir.call_extern("tvmgen_default_fused_multiply", x_buffer_var, y_buffer_var, output_buffer_var)\n' @@ -227,7 +225,6 @@ def test_without_device_api_packed_api(non_device_api_main_func): """Test a graph without the Device API with the packed internal calls""" main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False) - print(str(main_func.body)) assert ( str(main_func.body) == 'let tvm_value_3 = tir.tvm_stack_alloca("array", 1)\n' From 7558345f845e789d771ad5bdfacd92e123151272 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Thu, 13 Jan 2022 10:37:22 +0000 Subject: [PATCH 11/15] USMP integration with AoT executor codegen Moving PrintType from codegen_c.cc to codegen_source_base.cc to be accessible by source_module.cc Change-Id: Icb6a85de4c26110d2fea370fbd75e02a3639de59 --- src/target/source/codegen_c.cc | 55 ------------------------ src/target/source/codegen_c.h | 13 +----- src/target/source/codegen_source_base.cc | 55 ++++++++++++++++++++++++ src/target/source/codegen_source_base.h | 12 ++++++ src/target/source/source_module.cc | 36 +++++++++++----- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index ac6f3adad606..e6f81646242d 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -362,61 +362,6 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { ICHECK_EQ(scope, "global"); } -void CodeGenC::PrintType(DataType t, std::ostream& os) { // NOLINT(*) - ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; - if (t.is_handle()) { - os << "void*"; - return; - } - if (t.is_float()) { - if (t.bits() == 32) { - os << "float"; - return; - } - if (t.bits() == 64) { - os << "double"; - return; - } - } else if (t.is_uint()) { - switch (t.bits()) { - case 8: - case 16: - case 32: - case 64: { - os << "uint" << t.bits() << "_t"; - return; - } - case 1: - os << "int"; - return; - } - } else if (t.is_int()) { - switch (t.bits()) { - case 8: - case 16: - case 32: - case 64: { - os << "int" << t.bits() << "_t"; - return; - } - } - } - LOG(FATAL) << "Cannot convert type " << t << " to C type"; -} - -void CodeGenC::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) - if (auto* ptr = type.as()) { - return PrintType(ptr->dtype, os); - } else if (auto* ptr = type.as()) { - PrintType(ptr->element_type, os); - os << '*'; - } else if (IsVoidType(type)) { - os << "void"; - } else { - LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; - } -} - inline void PrintConst(const IntImmNode* op, std::ostream& os, CodeGenC* p) { // NOLINT(*) if (op->dtype == DataType::Int(32)) { std::ostringstream temp; diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index 299f7e0a9cef..3b042b9fbd2c 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -163,18 +163,7 @@ class CodeGenC : public ExprFunctor, void VisitStmt_(const AssertStmtNode* op) override; void VisitStmt_(const EvaluateNode* op) override; void VisitStmt_(const SeqStmtNode* op) override; - /*! - * Print Type represetnation of type t. - * \param t The type representation. - * \param os The stream to print the ctype into - */ - virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) - /*! - * Print Type represetnation of type type. - * \param type The type representation. 
- * \param os The stream to print the ctype into - */ - virtual void PrintType(const Type& type, std::ostream& os); // NOLINT(*) + /*! * \brief Print expr representing the thread tag * \param IterVar iv The thread index to be binded; diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 9f0cf9a70b61..933bd0502662 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,5 +113,60 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } +void CodeGenSourceBase::PrintType(DataType t, std::ostream& os) { // NOLINT(*) + ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; + if (t.is_handle()) { + os << "void*"; + return; + } + if (t.is_float()) { + if (t.bits() == 32) { + os << "float"; + return; + } + if (t.bits() == 64) { + os << "double"; + return; + } + } else if (t.is_uint()) { + switch (t.bits()) { + case 8: + case 16: + case 32: + case 64: { + os << "uint" << t.bits() << "_t"; + return; + } + case 1: + os << "int"; + return; + } + } else if (t.is_int()) { + switch (t.bits()) { + case 8: + case 16: + case 32: + case 64: { + os << "int" << t.bits() << "_t"; + return; + } + } + } + LOG(FATAL) << "Cannot convert type " << t << " to C type"; +} + +void CodeGenSourceBase::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) + if (auto* ptr = type.as()) { + return PrintType(ptr->dtype, os); + } else if (auto* ptr = type.as()) { + PrintType(ptr->element_type, os); + os << '*'; + } else if (IsVoidType(type)) { + os << "void"; + } else { + LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; + } +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index d938469b8969..a516ff6b3d96 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -52,6 +52,18 @@ class CodeGenSourceBase { * \param value The constant value. */ void MarkConst(std::string value); + /*! + * Print Type represetnation of type t. + * \param t The type representation. + * \param os The stream to print the ctype into + */ + virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) + /*! + * Print Type represetnation of type type. + * \param type The type representation. + * \param os The stream to print the ctype into + */ + virtual void PrintType(const Type& type, std::ostream& os); // NOLINT(*) protected: /*! \brief entry in ssa assign map */ diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index deec2c343ae2..8fe95970a1c6 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -35,7 +35,7 @@ #include "../../runtime/file_utils.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" -#include "codegen_c.h" +#include "codegen_source_base.h" namespace tvm { namespace codegen { @@ -128,6 +128,22 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, return runtime::Module(n); } +/*! + * \brief A concrete class to get access to base methods of CodegenSourceBase. + * + * This class exist to get access to methods of CodegenSourceBase without duplicating + * them. Therefore, keeping alignment with how codegen and source_module here generates + * code. + */ +class ConcreteCodegenSourceBase : public CodeGenSourceBase { + /*! 
+ * \brief Do nothing as this class exist to get access to methods of CodeGenSourceBase + */ + void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) final { + return; + } +}; + class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, @@ -166,7 +182,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Target target_; relay::Runtime runtime_; runtime::Metadata metadata_; - CodeGenC codegen_c_; + ConcreteCodegenSourceBase codegen_c_base_; void CreateFuncRegistry() { code_ << "#include \n"; @@ -245,9 +261,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::stringstream call_args_ss; for (const tir::Var& input_var : metadata_->inputs) { if (input_var->type_annotation.defined()) { - codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(input_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(input_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(input_var.dtype(), call_args_ss); } call_args_ss << " " << input_var->name_hint << ","; } @@ -256,9 +272,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } for (const tir::Var& pool_var : metadata_->pools) { if (pool_var->type_annotation.defined()) { - codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(pool_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(pool_var.dtype(), call_args_ss); } call_args_ss << " " << pool_var->name_hint << ","; } @@ -367,9 +383,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { std::stringstream call_args_ss; for (const tir::Var& input_var : metadata_->inputs) { if (input_var->type_annotation.defined()) { - codegen_c_.PrintType(input_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(input_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(input_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(input_var.dtype(), call_args_ss); } call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ","; } @@ -378,9 +394,9 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { } for (const tir::Var& pool_var : metadata_->pools) { if (pool_var->type_annotation.defined()) { - codegen_c_.PrintType(pool_var->type_annotation, call_args_ss); + codegen_c_base_.PrintType(pool_var->type_annotation, call_args_ss); } else { - codegen_c_.PrintType(pool_var.dtype(), call_args_ss); + codegen_c_base_.PrintType(pool_var.dtype(), call_args_ss); } call_args_ss << " " << pool_var->name_hint << ","; } From 48d25c6697ae4b916f54387c95640b02a7e30bee Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 14:28:47 +0000 Subject: [PATCH 12/15] USMP integration with AoT executor codegen Moving runtime::metadata to be ExecutorCodegeMetadata as it contains metadata produced by ExecutorCodegen for actual codegeneration (not a runtime component). 
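As a minimal sketch of the intended usage (not part of this patch), an executor codegen would construct the relocated metadata object roughly as follows; the values below are placeholders, and includes/namespaces are elided for brevity:

    // Sketch only: mirrors the ExecutorCodegenMetadata constructor added in
    // src/relay/backend/utils.h below; all values here are placeholders.
    relay::backend::ExecutorCodegenMetadata MakeExampleMetadata() {
      Array<tir::Var> inputs;                                   // main function input vars
      Array<tir::Var> pools;                                    // vars bound to USMP pools
      Array<String> devices;                                    // device context names
      Map<tir::Var, tir::usmp::AllocatedPoolInfo> pool_inputs;  // pool var -> allocation info
      return relay::backend::ExecutorCodegenMetadata(
          inputs, pools, devices, /*num_outputs=*/1, runtime::kTvmExecutorAot,
          /*mod_name=*/"default", /*interface_api=*/"c", /*unpacked_api=*/true, pool_inputs);
    }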
Change-Id: I13e95573ef331fb995281dbe220db01a7aa91add --- src/relay/backend/aot_executor_codegen.cc | 6 +-- src/relay/backend/build_module.cc | 4 +- src/relay/backend/utils.h | 59 ++++++++++++++++++++++- src/relay/backend/vm/compiler.cc | 3 +- src/runtime/meta_data.h | 56 --------------------- src/target/metadata_module.cc | 2 +- src/target/metadata_module.h | 4 +- src/target/source/source_module.cc | 13 +++-- src/target/source/source_module.h | 4 +- tests/python/relay/aot/corstone300.ld | 9 +--- 10 files changed, 82 insertions(+), 78 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index dd66758d8c71..f076efeb4ac5 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -957,9 +957,9 @@ class AOTExecutorCodegen : public MixedModeVisitor { Array(tir_main_func->params.begin(), tir_main_func->params.begin() + tir_main_func->params.size() - return_sid_.size() - pool_vars.size() - devices.size()); - ret.metadata = - runtime::Metadata(inputs, pool_vars, devices, return_sid_.size(), runtime::kTvmExecutorAot, - mod_name, interface_api, use_unpacked_api_, pool_var_info); + ret.metadata = ExecutorCodegenMetadata(inputs, pool_vars, devices, return_sid_.size(), + runtime::kTvmExecutorAot, mod_name, interface_api, + use_unpacked_api_, pool_var_info); return ret; } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index ccfd30476f67..2f986669e758 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -103,7 +103,9 @@ struct ExecutorCodegen { Array ListDevices() { return CallFunc>("get_devices"); } - runtime::Metadata GetMetadata() { return CallFunc("get_metadata"); } + relay::backend::ExecutorCodegenMetadata GetMetadata() { + return CallFunc("get_metadata"); + } virtual ~ExecutorCodegen() {} protected: diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 658283b5dc36..8e7aff0e6785 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -53,6 +53,63 @@ class TECompiler; namespace backend { using Pass = tvm::transform::Pass; +/*! + * \brief Structure that can be optionally used by the executor codegen + */ +class ExecutorCodegenMetadataNode : public Object { + public: + /*! \brief input information for the main function */ + Array inputs; + /*! \brief pool information for the main function */ + Array pools; + /*! \brief number of outputs of the main function */ + unsigned int num_outputs = 1; + /*! \brief device contexts information for the main function */ + Array devices; + /*! \brief the executor to be used to run the model */ + String executor = runtime::kTvmExecutorGraph; + /*! \brief The external API (packed or c) in use */ + String interface_api; + /*! \brief The internal API (packed or unpacked) in use */ + bool unpacked_api; + /*! \brief the input var names that correspond to pool_inputs */ + Optional> pool_inputs; + + String mod_name = ""; + + static constexpr const uint32_t _type_index = TypeIndex::kDynamic; + static constexpr const char* _type_key = "MetadataObj"; + TVM_DECLARE_FINAL_OBJECT_INFO(ExecutorCodegenMetadataNode, Object); +}; + +/*! + * \brief Managed reference to ExecutorCodegenMetadataNode. 
+ */ +class ExecutorCodegenMetadata : public ObjectRef { + public: + TVM_DLL ExecutorCodegenMetadata(Array inputs, Array pools, + Array devices, int num_outputs, String executor, + String mod_name, String interface_api = "packed", + bool unpacked_api = false, + Map pool_inputs = + Map()) { + auto n = make_object(); + n->inputs = inputs; + n->pools = pools; + n->devices = devices; + n->num_outputs = num_outputs; + n->executor = executor; + n->interface_api = interface_api; + n->unpacked_api = unpacked_api; + n->mod_name = mod_name; + n->pool_inputs = pool_inputs; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(ExecutorCodegenMetadata, ObjectRef, ExecutorCodegenMetadataNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(ExecutorCodegenMetadataNode); +}; + /*! * \brief The static storage information for each Tensor in the result of a Relay expression * (as per relay::FlattenTupleType). @@ -147,7 +204,7 @@ struct LoweredOutput { Array external_mods; Map function_metadata; std::unordered_map> params; - runtime::Metadata metadata; + ExecutorCodegenMetadata metadata; }; /*! diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 73f4b672a81c..f68dd9f8d2df 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1162,7 +1162,8 @@ void VMCompiler::Codegen() { } lib = codegen::CreateMetadataModule(params_, lib, ext_mods, config_->host_target, - Runtime::Create("cpp"), runtime::Metadata()); + Runtime::Create("cpp"), + relay::backend::ExecutorCodegenMetadata()); exec_->SetLib(lib); } diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 80d3eeb09dd8..3b9f5db4da6a 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -50,62 +50,6 @@ inline String get_name_mangled(const String& module_name, const String& name) { return ss.str(); } -/*! - * \brief Structure that can be optionally used by the executor codegen - */ -class MetadataNode : public Object { - public: - /*! \brief input information for the main function */ - Array inputs; - /*! \brief pool information for the main function */ - Array pools; - /*! \brief number of outputs of the main function */ - unsigned int num_outputs = 1; - /*! \brief device contexts information for the main function */ - Array devices; - /*! \brief the executor to be used to run the model */ - String executor = kTvmExecutorGraph; - /*! \brief The external API (packed or c) in use */ - String interface_api; - /*! \brief The internal API (packed or unpacked) in use */ - bool unpacked_api; - /*! \brief the input var names that correspond to pool_inputs */ - Optional> pool_inputs; - - String mod_name = ""; - - static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const char* _type_key = "MetadataObj"; - TVM_DECLARE_FINAL_OBJECT_INFO(MetadataNode, Object); -}; - -/*! - * \brief Managed reference to MetadataNode. 
- */ -class Metadata : public ObjectRef { - public: - TVM_DLL Metadata(Array inputs, Array pools, Array devices, - int num_outputs, String executor, String mod_name, - String interface_api = "packed", bool unpacked_api = false, - Map pool_inputs = - Map()) { - auto n = make_object(); - n->inputs = inputs; - n->pools = pools; - n->devices = devices; - n->num_outputs = num_outputs; - n->executor = executor; - n->interface_api = interface_api; - n->unpacked_api = unpacked_api; - n->mod_name = mod_name; - n->pool_inputs = pool_inputs; - data_ = std::move(n); - } - - TVM_DEFINE_OBJECT_REF_METHODS(Metadata, ObjectRef, MetadataNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(MetadataNode); -}; - /*! * \brief Create a metadata module object. * diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 2b190e5d66ed..2facf1de64d5 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -37,7 +37,7 @@ namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, tvm::runtime::Module target_module, const Array& ext_modules, Target target, - tvm::relay::Runtime runtime, runtime::Metadata metadata) { + tvm::relay::Runtime runtime, relay::backend::ExecutorCodegenMetadata metadata) { // Here we split modules into two groups: // 1. Those modules which can be exported to C-runtime. These are DSO-exportable // (i.e. llvm or c) modules which return nothing from get_const_vars(). diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h index ee6f7231b3a1..2afcf3497ab8 100644 --- a/src/target/metadata_module.h +++ b/src/target/metadata_module.h @@ -33,7 +33,7 @@ #include #include -#include "../runtime/meta_data.h" +#include "../relay/backend/utils.h" namespace tvm { namespace codegen { @@ -54,7 +54,7 @@ namespace codegen { runtime::Module CreateMetadataModule( const std::unordered_map& params, runtime::Module target_module, const Array& ext_modules, Target target, tvm::relay::Runtime runtime, - runtime::Metadata metadata); + relay::backend::ExecutorCodegenMetadata metadata); } // namespace codegen } // namespace tvm diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 8fe95970a1c6..43fc619133b5 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -147,7 +147,8 @@ class ConcreteCodegenSourceBase : public CodeGenSourceBase { class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, - Target target, relay::Runtime runtime, runtime::Metadata metadata) + Target target, relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata) : fmt_(fmt), func_names_(func_names), target_(target), @@ -181,7 +182,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { Array func_names_; Target target_; relay::Runtime runtime_; - runtime::Metadata metadata_; + relay::backend::ExecutorCodegenMetadata metadata_; ConcreteCodegenSourceBase codegen_c_base_; void CreateFuncRegistry() { @@ -230,7 +231,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { for (const auto& kv : metadata_->pool_inputs.value()) { tir::usmp::AllocatedPoolInfo allocated_pool_info = kv.second; if (allocated_pool_info->pool_info->is_internal) { - code_ << "__attribute__((section(\".bss.tvm\"), "; + code_ << "__attribute__((section(\".data.tvm\"), "; code_ << "aligned(" << 16 << ")))\n"; code_ << "static uint8_t " << allocated_pool_info->pool_info->pool_name << "[" 
<< allocated_pool_info->allocated_size->value << "];\n"; @@ -496,7 +497,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { }; runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - relay::Runtime runtime, runtime::Metadata metadata) { + relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata) { Array func_names; for (runtime::Module mod : modules) { auto pf_funcs = mod.GetFunction("get_func_names"); @@ -580,7 +582,8 @@ TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule") .set_body_typed([](const Array& modules, Target target, relay::Runtime runtime) { // Note that we don't need metadata when we compile a single operator - return CreateCSourceCrtMetadataModule(modules, target, runtime, runtime::Metadata()); + return CreateCSourceCrtMetadataModule(modules, target, runtime, + relay::backend::ExecutorCodegenMetadata()); }); } // namespace codegen diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h index fde363c1198a..3b482a107600 100644 --- a/src/target/source/source_module.h +++ b/src/target/source/source_module.h @@ -29,6 +29,7 @@ #include #include +#include "../../relay/backend/utils.h" #include "../../runtime/meta_data.h" namespace tvm { @@ -43,7 +44,8 @@ namespace codegen { * \return The wrapped module. */ runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, Target target, - relay::Runtime runtime, runtime::Metadata metadata); + relay::Runtime runtime, + relay::backend::ExecutorCodegenMetadata metadata); } // namespace codegen } // namespace tvm diff --git a/tests/python/relay/aot/corstone300.ld b/tests/python/relay/aot/corstone300.ld index e066b1538481..a825da74c1db 100644 --- a/tests/python/relay/aot/corstone300.ld +++ b/tests/python/relay/aot/corstone300.ld @@ -141,6 +141,8 @@ SECTIONS . = ALIGN (16); *(.rodata.tvm) . = ALIGN (16); + *(.data.tvm) + . = ALIGN (16); } > DDR .text : @@ -249,13 +251,6 @@ SECTIONS . = ALIGN(16); } > SRAM AT > SRAM - .bss.tvm : - { - . = ALIGN(16); - *(.bss.tvm) - . = ALIGN(16); - } > DDR - .bss.NoInit : { . = ALIGN(16); From 8235183f9f994b17645e63e7a0b6467f79ac431d Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 14:59:15 +0000 Subject: [PATCH 13/15] USMP integration with AoT executor codegen Remove unused function declaration Change-Id: Ib274553938eb5c82fe2b30cecc982481cff3937d --- include/tvm/tir/usmp/utils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index ec909a296fd8..ea37111f3ef0 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -295,7 +295,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info */ Integer CalculateModuleWorkspaceSize(const IRModule& mod); -void PrintConflicts(const Array& buffer_info_arr); /*! * \brief The allocate node attribute to indicate candidate memory pools. 
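With the workspace-pool change in the patch above, internally-managed pools are emitted into a ".data.tvm" section and the corstone300 test linker script now gathers that section into the DDR region in place of the old ".bss.tvm" output section. Purely as an illustration, a pool named "global_workspace" of 1024 bytes (both values are placeholders, not produced by this patch) would be emitted by GenerateInternalWorkspaceBuffers roughly as:

    /* Illustrative generated source only: the pool name and size below are
     * placeholders chosen for the example. */
    __attribute__((section(".data.tvm"), aligned(16)))
    static uint8_t global_workspace[1024];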
From 526f3070abd5dc1b5938bfdee361387d25fbed44 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 14 Jan 2022 17:05:28 +0000 Subject: [PATCH 14/15] USMP integration with AoT executor codegen * Remove unncessary header file inclusion Change-Id: I5575d6226baa74fe09dce1c71176e557d84f669a --- include/tvm/tir/usmp/utils.h | 1 - src/relay/backend/utils.h | 1 + src/runtime/meta_data.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/tvm/tir/usmp/utils.h b/include/tvm/tir/usmp/utils.h index ea37111f3ef0..582399865d6f 100644 --- a/include/tvm/tir/usmp/utils.h +++ b/include/tvm/tir/usmp/utils.h @@ -295,7 +295,6 @@ Array CreateArrayBufferInfo(const Map& buffer_info */ Integer CalculateModuleWorkspaceSize(const IRModule& mod); - /*! * \brief The allocate node attribute to indicate candidate memory pools. * This needs to be kept in sync with CANDIDATE_MEMORY_POOL_ATTR in diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 8e7aff0e6785..cb019083a9d5 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h index 3b9f5db4da6a..e83e1a3a7629 100644 --- a/src/runtime/meta_data.h +++ b/src/runtime/meta_data.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include From 4a7c907ea5e85dc43a577638951a7fc515d0f873 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Tue, 18 Jan 2022 15:49:49 +0000 Subject: [PATCH 15/15] USMP integration with AoT executor codegen * fixed a typo * Improved consistenty with arg names for type printers Change-Id: I25263831c8bbfa11e6e0d6f8f8dd998d254294ff --- src/target/source/codegen_source_base.cc | 26 ++++++++++++------------ src/target/source/codegen_source_base.h | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 933bd0502662..5dcf1587bdb9 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -113,46 +113,46 @@ void CodeGenSourceBase::EndScope(int scope_id) { indent_ -= 2; } -void CodeGenSourceBase::PrintType(DataType t, std::ostream& os) { // NOLINT(*) - ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; - if (t.is_handle()) { +void CodeGenSourceBase::PrintType(DataType type, std::ostream& os) { // NOLINT(*) + ICHECK_EQ(type.lanes(), 1) << "do not yet support vector types"; + if (type.is_handle()) { os << "void*"; return; } - if (t.is_float()) { - if (t.bits() == 32) { + if (type.is_float()) { + if (type.bits() == 32) { os << "float"; return; } - if (t.bits() == 64) { + if (type.bits() == 64) { os << "double"; return; } - } else if (t.is_uint()) { - switch (t.bits()) { + } else if (type.is_uint()) { + switch (type.bits()) { case 8: case 16: case 32: case 64: { - os << "uint" << t.bits() << "_t"; + os << "uint" << type.bits() << "_t"; return; } case 1: os << "int"; return; } - } else if (t.is_int()) { - switch (t.bits()) { + } else if (type.is_int()) { + switch (type.bits()) { case 8: case 16: case 32: case 64: { - os << "int" << t.bits() << "_t"; + os << "int" << type.bits() << "_t"; return; } } } - LOG(FATAL) << "Cannot convert type " << t << " to C type"; + LOG(FATAL) << "Cannot convert type " << type << " to C type"; } void CodeGenSourceBase::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) diff --git a/src/target/source/codegen_source_base.h 
b/src/target/source/codegen_source_base.h index a516ff6b3d96..8f8f9e1b8bf2 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -53,13 +53,13 @@ class CodeGenSourceBase { */ void MarkConst(std::string value); /*! - * Print Type represetnation of type t. + * Print Type representation of type type. * \param t The type representation. * \param os The stream to print the ctype into */ - virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) + virtual void PrintType(DataType type, std::ostream& os); // NOLINT(*) /*! - * Print Type represetnation of type type. + * Print Type representation of type type. * \param type The type representation. * \param os The stream to print the ctype into */
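With patches 11 and 15 applied, CodeGenSourceBase::PrintType becomes the shared type printer used by the metadata source module. A minimal sketch of the call pattern inside source_module.cc follows; the argument variable is a placeholder, and ConcreteCodegenSourceBase is file-local to source_module.cc, so this is illustrative rather than standalone code:

    // Sketch of how a main-function argument's C type is printed into the
    // call-arguments stream; "input_0" is a placeholder variable.
    ConcreteCodegenSourceBase codegen_c_base;
    std::stringstream call_args_ss;
    tir::Var input_var("input_0", DataType::Handle());
    if (input_var->type_annotation.defined()) {
      codegen_c_base.PrintType(input_var->type_annotation, call_args_ss);
    } else {
      codegen_c_base.PrintType(input_var.dtype(), call_args_ss);
    }
    call_args_ss << " " << relay::backend::SanitizeName(input_var->name_hint) << ",";
    // call_args_ss now holds something like "void* input_0,".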