From a9e0b72954563d754ee251fd78ffc0ba46b85578 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 6 May 2021 11:06:48 +0100 Subject: [PATCH 1/7] Fix an issue with storage-rewrite pass and packed functions Change-Id: I13888471d4b8927a4012d6a8e749fb7a8935dd77 --- src/relay/backend/aot_executor_codegen.cc | 282 ++++++++++++++++------ src/tir/transforms/storage_rewrite.cc | 36 ++- tests/python/relay/aot/aot_test_utils.py | 48 +++- tests/python/relay/aot/test_crt_aot.py | 42 ++++ 4 files changed, 336 insertions(+), 72 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 93935af70fca..a6eae6d5854e 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -44,52 +45,185 @@ namespace tvm { namespace relay { namespace backend { +/** + * Struct to contain information about the intermediate tensors in the + * runner function + */ +struct StorageInfo { + /*! \brief storage integer identifier of the particular intermediate buffer */ + int sid; + /*! \brief exact size of the temporary */ + int size_bytes; + /*! \brief device type of the intermediate tensor */ + int dev_type; +}; + using IntegerArray = Array; using TargetsMap = std::unordered_map; +using StorageMap = std::unordered_map, runtime::ObjectPtrHash, + runtime::ObjectPtrEqual>; -class AotReturnSidVisitor : public ExprVisitor { +/** + * This is an on demand allocator for AOT. A new temporary + * (storage allocator identifier) is allocated for each operation. + */ +class AOTOnDemandAllocator : public ExprVisitor { public: - explicit AotReturnSidVisitor(Map> storage_device_map) - : storage_device_map_{storage_device_map}, return_sid_{-1} {} + // run the visitor on a function. + void Run(const Function& func) { + node_device_map_ = CollectDeviceInfo(func); - IntegerArray FindReturnSid(Function func) { - VisitExpr(func->body); - return return_sid_; + for (Expr param : func->params) { + CreateStorage(param.operator->()); + } + + GetStorage(func->body); } - protected: - void AssignReturnSid(Expr e) { - auto iter = storage_device_map_.find(e); - if (iter != storage_device_map_.end()) { - return_sid_ = (*iter).second[0]; + std::vector GetReturnIds() const { return return_ids_; } + + StorageMap GetStorageMap() const { return storage_device_map_; } + + void VisitExpr_(const ConstantNode* op) final { + CreateStorage(op); + AssignReturnSid(GetRef(op)); + } + + void VisitExpr_(const CallNode* op) final { + // create token for the call node. + CreateStorage(op); + for (Expr arg : op->args) { + GetStorage(arg); } + AssignReturnSid(GetRef(op)); } - void VisitExpr_(const ConstantNode* cn) override { - ExprVisitor::VisitExpr_(cn); - AssignReturnSid(GetRef(cn)); + void VisitExpr_(const VarNode* op) final { + ExprVisitor::VisitExpr_(op); + AssignReturnSid(GetRef(op)); } - void VisitExpr_(const VarNode* vn) override { - ExprVisitor::VisitExpr_(vn); - AssignReturnSid(GetRef(vn)); + void VisitExpr_(const FunctionNode* op) final { + // do not recurse into sub function. } - void VisitExpr_(const CallNode* cn) override { - ExprVisitor::VisitExpr_(cn); - AssignReturnSid(GetRef(cn)); + void VisitExpr_(const GlobalVarNode* op) final { + // Do nothing. } - void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); } + void VisitExpr_(const OpNode* op) final { + // Do nothing. + } + + void VisitExpr_(const TupleNode* op) final { + std::vector field_sids; + Expr expr = GetRef(op); + for (Expr field : op->fields) { + auto sid = GetStorage(field); + field_sids.insert(field_sids.end(), sid.begin(), sid.end()); + } - void VisitExpr_(const TupleNode* tn) override { - ExprVisitor::VisitExpr_(tn); - AssignReturnSid(GetRef(tn)); + storage_device_map_[expr] = field_sids; + AssignReturnSid(expr); + } + + void VisitExpr_(const TupleGetItemNode* op) final { + Expr expr = GetRef(op); + const auto& sids = GetStorage(op->tuple); + ICHECK_LT(static_cast(op->index), sids.size()); + storage_device_map_[expr] = {sids[op->index]}; + AssignReturnSid(expr); } + void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "if is not supported."; } + + void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "if is not supported."; } + private: - Map> storage_device_map_; - IntegerArray return_sid_; + void AssignReturnSid(Expr e) { + if (storage_device_map_.find(e) != storage_device_map_.end()) { + auto buffers = storage_device_map_[e]; + std::vector return_ids; + for (auto buffer : buffers) { + return_ids.push_back(buffer.sid); + } + return_ids_ = return_ids; + } + } + /*! + * \brief ceil(size/word_size) to get number of words. + * \param size The original size. + * \param word_size The element size. + */ + static size_t DivRoundUp(size_t size, size_t word_size) { + return (size + word_size - 1) / word_size; + } + /*! + * \brief Get the memory requirement. + * \param prototype The prototype token. + * \return The required memory size. + */ + size_t GetMemorySize(const TensorTypeNode* ttype) { + ICHECK(ttype != nullptr); + size_t size = 1; + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; + size *= static_cast(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); + return size; + } + /*! + * \brief Get the necessary storage for the expression. + * \param expr The expression. + * \return The corresponding token. + */ + std::vector GetStorage(const Expr& expr) { + this->VisitExpr(expr); + auto it = storage_device_map_.find(expr); + ICHECK(it != storage_device_map_.end()); + return it->second; + } + + /*! + * \brief Create storage for the expression. + * \param expr The expression. + */ + void CreateStorage(const ExprNode* op) { + std::vector buffers; + Expr expr = GetRef(op); + int device_type = node_device_map_.count(GetRef(op)) ? node_device_map_[expr]->value : 0; + if (const auto* tuple_type = op->checked_type().as()) { + for (Type t : tuple_type->fields) { + const auto* ttype = t.as(); + ICHECK(ttype); + StorageInfo buffer; + buffer.sid = sid_++; + buffer.size_bytes = GetMemorySize(ttype); + buffer.dev_type = device_type; + buffers.push_back(buffer); + } + } else { + const auto* ttype = op->checked_type().as(); + ICHECK(ttype); + StorageInfo buffer; + buffer.sid = sid_++; + buffer.size_bytes = GetMemorySize(ttype); + buffer.dev_type = device_type; + buffers.push_back(buffer); + } + storage_device_map_[expr] = buffers; + } + /*! \brief mapping of expression -> storageInfo*/ + StorageMap storage_device_map_; + /*! \brief mapping of expression -> device type*/ + Map node_device_map_; + /*! \brief current id of the temporary allocated*/ + int sid_{0}; + /*! \brief the set of intermediate tensors that are return variables */ + std::vector return_ids_; }; /*! \brief Code generator for AOT executor */ @@ -120,37 +254,37 @@ class AOTExecutorCodegen : public ExprVisitor { * \brief Return a vector of variables that represents the sids for the given Relay Expr */ std::vector PackSid(Expr expr) { - Array sids = storage_device_map_[expr]; - std::vector sid_vars; + auto buffers = storage_device_map_[expr]; + std::vector buffer_vars; // Note that an expression can have multiple sids associated with it // e.g., returning multiple values from a function - for (const auto& sid : sids[0]) { + for (const auto& buffer : buffers) { // Determine if an sid is an output buffer - int sid_int = static_cast((sid.as())->value); - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int); + int sid = buffer.sid; + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - sid_vars.push_back(main_signature_[input_vars_.size() + output_index]); + buffer_vars.push_back(main_signature_[input_vars_.size() + output_index]); continue; } - // Pack the sid inside the TVMValue - auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); - auto sid_value = sids_table_[sid]; + + auto sid_value = sids_table_[sid]; if (!use_unpacked_api_) { + // Pack the sid inside the TVMValue + auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {sid_array, 0, tir::builtin::kArrData, sid_value}); stmts_.push_back( tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); + buffer_vars.push_back(sid_array); } else { - stmts_.push_back(tir::LetStmt(sid_array, sid_value, tir::Evaluate(0))); + buffer_vars.push_back(sid_value) } - - sid_vars.push_back(sid_array); } - return sid_vars; + return buffer_vars; } /*! @@ -390,8 +524,7 @@ class AOTExecutorCodegen : public ExprVisitor { } ICHECK_GE(storage_device_map_.count(expr), 0); - auto& device_type = storage_device_map_[expr][1]; - auto call_dev_type = device_type[0]->value; + auto call_dev_type = storage_device_map_[expr][0].dev_type; // Normal Relay Function if (targets_.size() == 1) { // homogeneous execution. @@ -428,14 +561,14 @@ class AOTExecutorCodegen : public ExprVisitor { // If the Var node is an output node we need to copy the content of the variable to the output // It's safe to check the SID here because Var StorageToken are never reallocated - Array sids = storage_device_map_[expr]; + auto buffers = storage_device_map_[expr]; - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), - static_cast((sids[0][0].as())->value)); + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); auto var_expr = FindExpr(expr); - CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]); + CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], + buffers[0].size_bytes); } } @@ -444,18 +577,18 @@ class AOTExecutorCodegen : public ExprVisitor { size_t index = params_.size(); std::string name = "p" + std::to_string(index); - param_storage_ids_[name] = storage_device_map_[expr][0][0]->value; + param_storage_ids_[name] = storage_device_map_[expr][0].sid; params_[name] = op->data; params_by_expr_.Set(expr, name); // If the Constant node is an output node we need to copy the content of the parameter to the // output A Var node can only produce a single output - Array sids = storage_device_map_[expr]; - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), - static_cast((sids[0][0].as())->value)); + auto buffers = storage_device_map_[expr]; + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]); + CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), + buffers[0].size_bytes); } } @@ -511,9 +644,9 @@ class AOTExecutorCodegen : public ExprVisitor { continue; } - for (unsigned int i = 0; i < kv.second[0].size(); i++) { - int size = kv.second[2][i]; - int sid = static_cast((kv.second[0][i].as())->value); + for (unsigned int i = 0; i < kv.second.size(); i++) { + int size = kv.second[i].size_bytes; + int sid = kv.second[i].sid; if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) { continue; @@ -523,6 +656,8 @@ class AOTExecutorCodegen : public ExprVisitor { // so we don't pay the price of allocation for every inference if (!allocated[sid]) { body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body); + body = tir::AttrStmt(sids_table_[sid], tir::attr::storage_scope, tir::StringImm("global"), + body); } allocated[sid] = true; } @@ -578,7 +713,8 @@ class AOTExecutorCodegen : public ExprVisitor { std::unordered_map param_storage_ids_; /*! \brief plan memory of device result */ - Map> storage_device_map_; + StorageMap storage_device_map_; + /*! \brief mapping sid -> tir::Var */ std::unordered_map sids_table_; /*! \brief lowered funcs */ std::unordered_map lowered_funcs_; @@ -589,10 +725,11 @@ class AOTExecutorCodegen : public ExprVisitor { /*! \brief the set of statements that make the program */ std::vector stmts_; /*! \brief the list of return sids (note that the function might return more then one output */ - IntegerArray return_sid_; + std::vector return_sid_; /*! \brief the module name we use to mangle the function names */ String mod_name_; + public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) : mod_(mod), @@ -602,9 +739,11 @@ class AOTExecutorCodegen : public ExprVisitor { compile_engine_(CompileEngine::Global()) {} LoweredOutput Codegen(relay::Function func, String mod_name) { - // Get the module, storage map and token sizes - auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); - storage_device_map_ = (*pf)(func); + auto aot_allocator = AOTOnDemandAllocator(); + aot_allocator.Run(func); + + // Retrieve the storage map + storage_device_map_ = aot_allocator.GetStorageMap(); mod_name_ = mod_name; for (auto input : func->params) { @@ -614,14 +753,14 @@ class AOTExecutorCodegen : public ExprVisitor { // Define the storage allocator ids for (auto kv : storage_device_map_) { - for (const auto& sid : kv.second[0]) { - te::Var sid_var(MakeString("sid_", sid), PointerType(PrimType(DataType::Int(8)))); - sids_table_[sid] = sid_var; + for (const auto& buffer : kv.second) { + te::Var buffer_var(MakeString("sid_", buffer.sid), PointerType(PrimType(DataType::Int(8)))); + sids_table_[buffer.sid] = buffer_var; } } - // Find the return sid - return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); + // Retrieve the return sids + return_sid_ = aot_allocator.GetReturnIds(); for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { main_signature_.push_back(tir::Var("output", DataType::Handle())); } @@ -649,14 +788,21 @@ class AOTExecutorCodegen : public ExprVisitor { } ret.external_mods = compile_engine_->LowerExternalFunctions(); + // Build the TIR IRModule + Map symbol_map; + symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); + IRModule mod_run(symbol_map); + + // Apply storage rewrite pass to the runner function to do memory planning + auto storage_rewrite = tir::transform::StorageRewrite(); + mod_run = storage_rewrite(mod_run); + + // Update the lowered functions auto target_host_str = target_host_->str(); if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) { - ret.lowered_funcs[target_host_str]->Add( - GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); + ret.lowered_funcs[target_host_str]->Update(mod_run); } else { - Map symbol_map; - symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_suffix), prim_func); - ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); + ret.lowered_funcs.Set(target_host_str, mod_run); } ret.function_metadata = std::move(function_metadata_); ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size(), diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 36eeddb17d89..66218b3555d3 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -138,6 +138,34 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); this->VisitExpr(l->index); + } else if (op->op.same_as(builtin::tvm_call_cpacked())) { + // Recall that the arguments of a tvm_call_cpacked are passed as + // TVMValues. But a TVMValue is only a container, that points to + // a real buffer previously allocated. We need to signal that those + // buffers need to be live at the same time (i.e., cannot be overridden) + Array args = op->args; + for (auto arg : args) { + const VarNode* var = arg.as(); + if (value_to_alloc_.find(var) != value_to_alloc_.end()) { + auto allocs = value_to_alloc_[var]; + for (const VarNode* alloc : allocs) { + VisitExpr_(alloc); + } + } else { + this->VisitExpr(arg); + } + } + } else if (op->op.same_as(builtin::tvm_struct_set())) { + // If we are using a struct_set built-in, and we are setting + // a DLTensor ArrayData field, let's note down the + // buffers that the TVMValue refers to + const VarNode* var = op->args[0].as(); + const VarNode* alloc = op->args[3].as(); + const int field_id = op->args[2].as()->value; + if (var && alloc && field_id == tir::builtin::kArrData) { + value_to_alloc_[var].push_back(alloc); + } + StmtExprVisitor::VisitExpr_(op); } else { StmtExprVisitor::VisitExpr_(op); } @@ -206,6 +234,8 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { bool in_thread_env_{false}; // The scope stack. std::vector scope_; + // This is a map to connect TVMValues to real allocations + std::unordered_map> value_to_alloc_; }; // Verify if the statement can be run safely via inplace fashion @@ -887,11 +917,11 @@ class StoragePlanRewriter : public StmtExprMutator { // symbolic free list, for non constant items. std::list sym_free_list_; // The allocation attach map - std::unordered_map > attach_map_; + std::unordered_map> attach_map_; // The allocation assign map std::unordered_map alloc_map_; // The allocations - std::vector > alloc_vec_; + std::vector> alloc_vec_; // analyzer arith::Analyzer analyzer_; }; @@ -950,7 +980,7 @@ class VectorAllocRewriter : public StmtExprMutator { } // Internal access map - std::unordered_map > acc_map_; + std::unordered_map> acc_map_; // Variables to remap Map var_remap_; // internal analyzer diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index a18a0fa7dbe7..26b88f005001 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -37,9 +37,50 @@ from tvm.micro import export_model_library_format +<<<<<<< HEAD def mangle_name(mod_name, name): mod_name = mangle_module_name(mod_name) return mod_name + "_" + name +======= +def convert_to_relay( + tflite_model_buf, + input_data, + input_node, +): + """ Convert a tflite model buffer in a Relay module """ + + def convert_to_list(x): + if not isinstance(x, list): + x = [x] + return x + + # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 + try: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except ImportError: + raise ImportError("The tflite package must be installed") + + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) + + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_node): + shape_dict[e] = input_data[i].shape + dtype_dict[e] = input_data[i].dtype.name + + mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict + ) + mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params) + return mod, params +>>>>>>> 7d02e64f6... Fix an issue with storage-rewrite pass and packed functions def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): @@ -221,6 +262,7 @@ def compile_and_run( params=None, workspace_byte_alignment=8, mod_name=None, + enable_op_fusion=True, ): """ This method verifies the generated source @@ -232,7 +274,11 @@ def compile_and_run( if not use_calculated_workspaces: cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " - with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + config = {"tir.disable_vectorize": True} + if not enable_op_fusion: + config["relay.FuseOps.max_depth"] = 1 + + with tvm.transform.PassContext(opt_level=3, config=config): lib = tvm.relay.build(mod, target, target_host=target, params=params, mod_name=mod_name) tmp_path = utils.tempdir() diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 36596a4bec21..307ef845e199 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -465,5 +465,47 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), ) +def test_quant_mobilenet_tfl(): + pytest.importorskip("tflite") + + import tvm.relay.testing.tf as tf_testing + + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/" + "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data_shape = (1, 224, 224, 3) + in_min, in_max = (0, 255) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") + mod, params = convert_to_relay(tflite_model_buf, data, "input") + inputs = {"input": data} + output_list = generate_ref_data(mod, inputs, params) + input_list = [inputs["input"]] + compile_and_run(mod, input_list, output_list, True, params) + +@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) +def test_transpose(target_options): + dtype = "float32" + x = relay.var("x", shape=(10, 5), dtype=dtype) + y = relay.var("y", shape=(10, 5), dtype=dtype) + t = relay.var("z", shape=(), dtype=dtype) + a = relay.add(x, y) + b = relay.transpose(a) + z = relay.add(b, t) + # Check result. + func = relay.Function([x, y, t], z) + x_data = np.random.rand(10, 5).astype(dtype) + y_data = np.random.rand(10, 5).astype(dtype) + t_data = np.random.uniform(size=()).astype(dtype) + inputs = {"x": x_data, "y": y_data, "z": t_data} + + output_list = generate_ref_data(func, inputs) + input_list = [inputs["x"], inputs["y"], inputs["z"]] + compile_and_run(func, input_list, output_list, target_options, True, enable_op_fusion=False) + + if __name__ == "__main__": pytest.main([__file__]) From 52a510909db78de3978c522eddd016881b84eb52 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 7 Jun 2021 16:00:33 +0100 Subject: [PATCH 2/7] Rebasing Change-Id: I7aa12e0217b8a2e1ff2a97a7c5fdda6b7597ae64 --- src/relay/backend/aot_executor_codegen.cc | 7 +++---- tests/python/relay/aot/aot_test_utils.py | 2 +- tests/python/relay/aot/test_crt_aot.py | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a6eae6d5854e..566f8dcb4574 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -269,11 +269,10 @@ class AOTExecutorCodegen : public ExprVisitor { continue; } - auto sid_value = sids_table_[sid]; if (!use_unpacked_api_) { - // Pack the sid inside the TVMValue - auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); + // Pack the sid inside the TVMValue + auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); tvm::PrimExpr set_tensor = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), {sid_array, 0, tir::builtin::kArrData, sid_value}); @@ -281,7 +280,7 @@ class AOTExecutorCodegen : public ExprVisitor { tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); buffer_vars.push_back(sid_array); } else { - buffer_vars.push_back(sid_value) + buffer_vars.push_back(sid_value); } } return buffer_vars; diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 26b88f005001..29994fb2afff 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -47,7 +47,7 @@ def convert_to_relay( input_data, input_node, ): - """ Convert a tflite model buffer in a Relay module """ + """Convert a tflite model buffer in a Relay module""" def convert_to_list(x): if not isinstance(x, list): diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 307ef845e199..321c77851006 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -484,7 +484,8 @@ def test_quant_mobilenet_tfl(): inputs = {"input": data} output_list = generate_ref_data(mod, inputs, params) input_list = [inputs["input"]] - compile_and_run(mod, input_list, output_list, True, params) + compile_and_run(mod, input_list, output_list, "--unpacked-api=0", True, params) + @pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_transpose(target_options): From 75167f3345ef98a55f0c516c9dbe34062f97bcaf Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 7 Jun 2021 22:20:01 +0100 Subject: [PATCH 3/7] Addressing comments Change-Id: If9f1ee190690f9a810fe41eb1933d736f1eb4ec3 --- src/relay/backend/aot_executor_codegen.cc | 19 +++++++++---------- src/tir/transforms/storage_rewrite.cc | 10 ++++++++-- tests/python/relay/aot/test_crt_aot.py | 5 +++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 566f8dcb4574..2f677bec17aa 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -137,17 +137,16 @@ class AOTOnDemandAllocator : public ExprVisitor { void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "if is not supported."; } - void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "if is not supported."; } + void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "let is not supported."; } private: void AssignReturnSid(Expr e) { if (storage_device_map_.find(e) != storage_device_map_.end()) { auto buffers = storage_device_map_[e]; - std::vector return_ids; + return_ids_.clear(); for (auto buffer : buffers) { - return_ids.push_back(buffer.sid); + return_ids_.push_back(buffer.sid); } - return_ids_ = return_ids; } } /*! @@ -163,7 +162,7 @@ class AOTOnDemandAllocator : public ExprVisitor { * \param prototype The prototype token. * \return The required memory size. */ - size_t GetMemorySize(const TensorTypeNode* ttype) { + size_t GetMemorySizeBytes(const TensorTypeNode* ttype) { ICHECK(ttype != nullptr); size_t size = 1; for (IndexExpr dim : ttype->shape) { @@ -200,8 +199,8 @@ class AOTOnDemandAllocator : public ExprVisitor { const auto* ttype = t.as(); ICHECK(ttype); StorageInfo buffer; - buffer.sid = sid_++; - buffer.size_bytes = GetMemorySize(ttype); + buffer.sid = next_available_sid_++; + buffer.size_bytes = GetMemorySizeBytes(ttype); buffer.dev_type = device_type; buffers.push_back(buffer); } @@ -209,8 +208,8 @@ class AOTOnDemandAllocator : public ExprVisitor { const auto* ttype = op->checked_type().as(); ICHECK(ttype); StorageInfo buffer; - buffer.sid = sid_++; - buffer.size_bytes = GetMemorySize(ttype); + buffer.sid = next_available_sid_++; + buffer.size_bytes = GetMemorySizeBytes(ttype); buffer.dev_type = device_type; buffers.push_back(buffer); } @@ -221,7 +220,7 @@ class AOTOnDemandAllocator : public ExprVisitor { /*! \brief mapping of expression -> device type*/ Map node_device_map_; /*! \brief current id of the temporary allocated*/ - int sid_{0}; + int next_available_sid_{0}; /*! \brief the set of intermediate tensors that are return variables */ std::vector return_ids_; }; diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 66218b3555d3..cd91a4b53317 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -142,7 +142,8 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { // Recall that the arguments of a tvm_call_cpacked are passed as // TVMValues. But a TVMValue is only a container, that points to // a real buffer previously allocated. We need to signal that those - // buffers need to be live at the same time (i.e., cannot be overridden) + // buffers need to be live at the same time (i.e., cannot be overwritten during the function + // call) Array args = op->args; for (auto arg : args) { const VarNode* var = arg.as(); @@ -234,7 +235,12 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { bool in_thread_env_{false}; // The scope stack. std::vector scope_; - // This is a map to connect TVMValues to real allocations + // This is a map to connect TVMValues to real allocations. When we pass parameters + // to a tvm_call_cpacked, the data needs to be wrapped in a TVMValue. The wrapping + // happens through the tvm_struct_set built-in. This map is mapping the variable + // representing the TVMValue to the variable representing the real buffer. The live + // analysis needs to happen on the latter and not on the TVMValue which only acts as + // a container. std::unordered_map> value_to_alloc_; }; diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 321c77851006..13cbfa71b6ae 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -466,6 +466,9 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), def test_quant_mobilenet_tfl(): + """Since in AOT we pass directly the output buffer from the user, in quantized networks sharing the output buffers is not possible. + This is because the output data type is int8 and the intermediate buffer are int32 or int16. We use mobilenet quantized to stress this + situation and verify that the output buffer sharing is disabled in AOT.""" pytest.importorskip("tflite") import tvm.relay.testing.tf as tf_testing @@ -489,6 +492,8 @@ def test_quant_mobilenet_tfl(): @pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"]) def test_transpose(target_options): + """Test that non-inpleaceable operations (e.g., transpose) do not happen in-place.""" + dtype = "float32" x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) From a1c345503218500b5add43946a7c5c0a5773c61b Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 18 Jun 2021 14:47:29 +0100 Subject: [PATCH 4/7] Add a pass to legalize packed calls Change-Id: I8aa43d3a1b837b03a5cf3c6b32fc760bd78d3436 --- include/tvm/tir/transform.h | 5 + src/relay/backend/aot_executor_codegen.cc | 94 ++++++++--------- src/tir/transforms/ir_utils.h | 23 +++++ src/tir/transforms/legalize_packed_calls.cc | 108 ++++++++++++++++++++ src/tir/transforms/lower_tvm_builtin.cc | 10 -- src/tir/transforms/storage_rewrite.cc | 42 +------- 6 files changed, 181 insertions(+), 101 deletions(-) create mode 100644 src/tir/transforms/legalize_packed_calls.cc diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 2113d58f1ffa..5ee847e2f010 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -418,6 +418,11 @@ TVM_DLL Pass ConvertBlocksToOpaque(); */ TVM_DLL Pass CompactBufferAllocation(); +/*! + * This pass legalizes packed calls by wrapping their arguments into TVMValues + */ +TVM_DLL Pass LegalizePackedCalls(); + /*! * \brief Flatten the multi-dimensional BufferLoad and BufferStore * to single dimensional Load/Store. Also remove Block to diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 2f677bec17aa..66f3f1ecaa20 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -269,50 +269,11 @@ class AOTExecutorCodegen : public ExprVisitor { } auto sid_value = sids_table_[sid]; - if (!use_unpacked_api_) { - // Pack the sid inside the TVMValue - auto sid_array = te::Var(MakeString("sid_", sid, "_value"), DataType::Handle()); - tvm::PrimExpr set_tensor = - tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {sid_array, 0, tir::builtin::kArrData, sid_value}); - stmts_.push_back( - tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor))); - buffer_vars.push_back(sid_array); - } else { - buffer_vars.push_back(sid_value); - } + buffer_vars.push_back(sid_value); } return buffer_vars; } - /*! - * \brief Utility function to return a parameter associated with an expression - * \param expr Relay Expression associated with the parameter - * \return Variable that represents the DLTensor associated with the parameters - */ - tir::Var PackParam(Expr expr) { - int param_sid = param_storage_ids_[params_by_expr_[expr]]; - auto param_array = te::Var(MakeString("param_", param_sid, "_array"), DataType::Handle()); - - // Compose the lookup_call using a local stack - Array lookup_call; - // Set the param to the value returned by lookup_call - auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), - {tir::StringImm(params_by_expr_[expr])}); - - if (!use_unpacked_api_) { - tvm::PrimExpr set_param_array = - tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {param_array, 0, tir::builtin::kArrData, param_handle}); - stmts_.push_back( - tir::LetStmt(param_array, StackAlloca("arg_value", 1), tir::Evaluate(set_param_array))); - } else { - stmts_.push_back(tir::LetStmt(param_array, param_handle, tir::Evaluate(0))); - } - - return param_array; - } - /*! * brief Given an expression return the variable(s) associated with that expression */ @@ -322,9 +283,6 @@ class AOTExecutorCodegen : public ExprVisitor { // Input variable int main_index = std::distance(input_vars_.begin(), input_iter); return {main_signature_[main_index]}; - } else if (params_by_expr_.find(arg) != params_by_expr_.end()) { - // Parameter of the network - return {PackParam(arg)}; } else { // Storage identifier (i.e., intermediate memory) return PackSid(arg); @@ -340,8 +298,14 @@ class AOTExecutorCodegen : public ExprVisitor { // Pack the inputs for (Expr arg : call->args) { - auto var_arg = FindExpr(arg); - args.push_back(var_arg[0]); + if (params_by_expr_.find(arg) != params_by_expr_.end()) { + auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), + {tir::StringImm(params_by_expr_[arg])}); + args.push_back(param_handle); + } else { + auto var_arg = FindExpr(arg); + args.push_back(var_arg[0]); + } } auto ret_expr = Downcast(call); @@ -369,7 +333,7 @@ class AOTExecutorCodegen : public ExprVisitor { * TODO(giuseros): we should try to avoid unnecessary copy to the output, e.g., in a * copy-on-write fashion. */ - void CopyToOutput(te::Var out, te::Var in, size_t size) { + void CopyToOutput(PrimExpr out, PrimExpr in, bool pack_input, size_t size) { // Define intermediate DLTensor to load/store the data auto tmp0 = te::Var("tmp0", DataType::Handle()); auto tmp1 = te::Var("tmp1", DataType::Handle()); @@ -381,10 +345,15 @@ class AOTExecutorCodegen : public ExprVisitor { PrimExpr tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(), {out, 0, tir::builtin::kArrData}); if (use_unpacked_api_) { - retval_get = in; tostore = out; } + // Do not pack the input if the flag is set or the caller + // explicitly asked to do so (e.g., copying a param to the output) + if (use_unpacked_api_ || !pack_input) { + retval_get = in; + } + // Copy the variable from the input to the output tir::Stmt copy = tir::For( loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial, @@ -564,9 +533,16 @@ class AOTExecutorCodegen : public ExprVisitor { auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - auto var_expr = FindExpr(expr); - CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], - buffers[0].size_bytes); + if (params_by_expr_.find(expr) != params_by_expr_.end()) { + auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), + {tir::StringImm(params_by_expr_[expr])}); + CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, + /*pack_input*/ true, buffers[0].size_bytes); + } else { + auto var_expr = FindExpr(expr); + CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], + /*pack_input*/ true, buffers[0].size_bytes); + } } } @@ -585,7 +561,9 @@ class AOTExecutorCodegen : public ExprVisitor { auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), + auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), + {tir::StringImm(params_by_expr_[expr])}); + CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, false, buffers[0].size_bytes); } } @@ -626,7 +604,9 @@ class AOTExecutorCodegen : public ExprVisitor { throw std::invalid_argument("match case not yet implemented"); } - // Create the main PrimFunc to execute the graph + // Create the main PrimFunc to execute the graph. Please note that + // the packed function calls don't pack their arguments. The AOT + // runner function needs to be legalized by the LegalizePackedCalls pass. tir::PrimFunc CreateMainFunc(unsigned int relay_params) { tir::Stmt body = tir::SeqStmt(stmts_); @@ -765,6 +745,9 @@ class AOTExecutorCodegen : public ExprVisitor { VisitExpr(func->body); + // Create the runner function. Please note that the function is not legal yet + // because the packed calls arguments are not wrapped in TVMValues. To make this happen we need + // to run the LegalizePackedCalls pass. auto prim_func = CreateMainFunc(func->params.size()); UpdateMainWorkspaceSize(prim_func, func); LoweredOutput ret; @@ -795,6 +778,13 @@ class AOTExecutorCodegen : public ExprVisitor { auto storage_rewrite = tir::transform::StorageRewrite(); mod_run = storage_rewrite(mod_run); + // Legalize AOT if needed. This means that all the packed calls + // need to be wrapped in TVMValues (unless use_unpacked_api is set) + if (!use_unpacked_api_) { + auto pack_calls = tir::transform::LegalizePackedCalls(); + mod_run = pack_calls(mod_run); + } + // Update the lowered functions auto target_host_str = target_host_->str(); if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) { diff --git a/src/tir/transforms/ir_utils.h b/src/tir/transforms/ir_utils.h index 3b4e693b820a..906ff8a38b6c 100644 --- a/src/tir/transforms/ir_utils.h +++ b/src/tir/transforms/ir_utils.h @@ -29,6 +29,8 @@ #include #include +#include +#include #include namespace tvm { @@ -161,6 +163,27 @@ inline int GetTempAllocaAlignment(DataType type, int32_t const_size) { return align; } +/*! + * \brief Create an int32 constant + * \param index the value of the constant + * \return the PrimExpr that represents the constant + */ +inline PrimExpr ConstInt32(size_t index) { + ICHECK_LE(index, std::numeric_limits::max()); + return make_const(DataType::Int(32), static_cast(index)); +} + +/*! + * \brief Allocate TVMValues on the stack + * \param type type of allocation + * \param num number of TVMValues to allocate + * \return PrimExpr representing the TVMValue + */ +inline PrimExpr StackAlloca(std::string type, size_t num) { + Array args = {StringImm(type), ConstInt32(num)}; + return Call(DataType::Handle(), builtin::tvm_stack_alloca(), args); +} + /*! * \brief Convert a IR node to be SSA form. * \param stmt The source statement to be converted. diff --git a/src/tir/transforms/legalize_packed_calls.cc b/src/tir/transforms/legalize_packed_calls.cc new file mode 100644 index 000000000000..70dfec161c3b --- /dev/null +++ b/src/tir/transforms/legalize_packed_calls.cc @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file make_packed_call.cc + * \brief Rewrite packed calls in AOT so that the arguments are packed + */ +#include +#include +#include +#include +#include +#include + +#include + +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +using InputMap = + std::unordered_map; +/** + * This is a legalization pass only used in AOT. Traverse the TIR graph to legalize + * packed calls by making its argument wrapped in TVMValues (by using tvm_set_struct built-in) + */ +class PackedCallLegalizer : public StmtExprMutator { + public: + Stmt Legalize(const InputMap& params, tir::Stmt body) { + inputs_ = params; + return StmtExprMutator::VisitStmt(body); + } + + Stmt VisitStmt_(const EvaluateNode* op) final { + if (tir::is_const_int(op->value)) return StmtExprMutator::VisitStmt_(op); + const CallNode* call = op->value.as(); + // Given a packed call f(A,B,C), we need a set of new statements + // let A_packed = set_struct(tvm_value1, A) + // let B_packed = set_struct(tvm_value2, B) + // let C_packed = set_struct(tvm_value3, C) + // call_packed(f, A_packed, B_packed, C_packed) + std::vector new_stmts; + if (call) { + if (call->op.same_as(builtin::tvm_call_cpacked())) { + Array packed_args{call->args[0]}; + for (unsigned i = 1; i < call->args.size(); i++) { + // No need to pack inputs of the prim_func + if (inputs_[call->args[i]] == true) { + packed_args.push_back(call->args[i]); + } else { + // Pack the argument inside a TVMValue + auto sid_array = tir::Var("tvm_value", DataType::Handle()); + tir::Stmt set_struct_stmt = tir::Evaluate( + tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), + {sid_array, 0, tir::builtin::kArrData, call->args[i]})); + new_stmts.push_back(LetStmt(sid_array, StackAlloca("array", 1), set_struct_stmt)); + packed_args.push_back(sid_array); + } + } + // Finally, evaluate the packed call and return a sequential statement + new_stmts.push_back(tir::Evaluate(tir::Call(call->dtype, call->op, packed_args))); + return tir::SeqStmt(new_stmts); + } + } + return StmtExprMutator::VisitStmt_(op); + } + + private: + InputMap inputs_; // Store the inputs to the primfunc that don't need to be packed. +}; + +namespace transform { + +Pass LegalizePackedCalls() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + auto* n = f.CopyOnWrite(); + + // Create the + InputMap inputs; + for (auto i : f->params) { + inputs[i] = true; + } + n->body = PackedCallLegalizer().Legalize(inputs, std::move(n->body)); + return std::move(f); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.LegalizePackedCalls", {}); +} +} // namespace transform + +} // namespace tir +} // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 0e2e612e3ae8..8b70817398e4 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -34,16 +34,6 @@ namespace tvm { namespace tir { -inline PrimExpr ConstInt32(size_t index) { - ICHECK_LE(index, std::numeric_limits::max()); - return make_const(DataType::Int(32), static_cast(index)); -} - -inline PrimExpr StackAlloca(std::string type, size_t num) { - Array args = {StringImm(type), ConstInt32(num)}; - return Call(DataType::Handle(), builtin::tvm_stack_alloca(), args); -} - // Calculate the statistics of packed function. // These information are needed during codegen. class BuiltinLower : public StmtExprMutator { diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index cd91a4b53317..36eeddb17d89 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -138,35 +138,6 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); this->VisitExpr(l->index); - } else if (op->op.same_as(builtin::tvm_call_cpacked())) { - // Recall that the arguments of a tvm_call_cpacked are passed as - // TVMValues. But a TVMValue is only a container, that points to - // a real buffer previously allocated. We need to signal that those - // buffers need to be live at the same time (i.e., cannot be overwritten during the function - // call) - Array args = op->args; - for (auto arg : args) { - const VarNode* var = arg.as(); - if (value_to_alloc_.find(var) != value_to_alloc_.end()) { - auto allocs = value_to_alloc_[var]; - for (const VarNode* alloc : allocs) { - VisitExpr_(alloc); - } - } else { - this->VisitExpr(arg); - } - } - } else if (op->op.same_as(builtin::tvm_struct_set())) { - // If we are using a struct_set built-in, and we are setting - // a DLTensor ArrayData field, let's note down the - // buffers that the TVMValue refers to - const VarNode* var = op->args[0].as(); - const VarNode* alloc = op->args[3].as(); - const int field_id = op->args[2].as()->value; - if (var && alloc && field_id == tir::builtin::kArrData) { - value_to_alloc_[var].push_back(alloc); - } - StmtExprVisitor::VisitExpr_(op); } else { StmtExprVisitor::VisitExpr_(op); } @@ -235,13 +206,6 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { bool in_thread_env_{false}; // The scope stack. std::vector scope_; - // This is a map to connect TVMValues to real allocations. When we pass parameters - // to a tvm_call_cpacked, the data needs to be wrapped in a TVMValue. The wrapping - // happens through the tvm_struct_set built-in. This map is mapping the variable - // representing the TVMValue to the variable representing the real buffer. The live - // analysis needs to happen on the latter and not on the TVMValue which only acts as - // a container. - std::unordered_map> value_to_alloc_; }; // Verify if the statement can be run safely via inplace fashion @@ -923,11 +887,11 @@ class StoragePlanRewriter : public StmtExprMutator { // symbolic free list, for non constant items. std::list sym_free_list_; // The allocation attach map - std::unordered_map> attach_map_; + std::unordered_map > attach_map_; // The allocation assign map std::unordered_map alloc_map_; // The allocations - std::vector> alloc_vec_; + std::vector > alloc_vec_; // analyzer arith::Analyzer analyzer_; }; @@ -986,7 +950,7 @@ class VectorAllocRewriter : public StmtExprMutator { } // Internal access map - std::unordered_map> acc_map_; + std::unordered_map > acc_map_; // Variables to remap Map var_remap_; // internal analyzer From dee4a8cca734ba402a74c3cb92ddb2c9f2c399d6 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 22 Jun 2021 23:06:05 +0100 Subject: [PATCH 5/7] Add a unit test for the legalization pass Change-Id: I5b0d75380ff660dd5a0acf5b14fa84bb992fbec4 --- python/tvm/tir/transform/transform.py | 11 +++ src/tir/transforms/legalize_packed_calls.cc | 27 +++++-- .../unittest/test_aot_legalize_packed_call.py | 80 +++++++++++++++++++ 3 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tests/python/unittest/test_aot_legalize_packed_call.py diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 8a32a7e6dff0..51330f80afc6 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -451,6 +451,17 @@ def LowerTVMBuiltin(): return _ffi_api.LowerTVMBuiltin() +def LegalizePackedCalls(): + """Legalize packed calls to have its arguments wrapped in TVMValues + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.LegalizePackedCalls() + + def LowerIntrin(): """Lower target specific intrinsic calls. diff --git a/src/tir/transforms/legalize_packed_calls.cc b/src/tir/transforms/legalize_packed_calls.cc index 70dfec161c3b..424da1e817b6 100644 --- a/src/tir/transforms/legalize_packed_calls.cc +++ b/src/tir/transforms/legalize_packed_calls.cc @@ -60,30 +60,41 @@ class PackedCallLegalizer : public StmtExprMutator { if (call) { if (call->op.same_as(builtin::tvm_call_cpacked())) { Array packed_args{call->args[0]}; + std::vector tvm_values; for (unsigned i = 1; i < call->args.size(); i++) { // No need to pack inputs of the prim_func if (inputs_[call->args[i]] == true) { packed_args.push_back(call->args[i]); } else { // Pack the argument inside a TVMValue - auto sid_array = tir::Var("tvm_value", DataType::Handle()); - tir::Stmt set_struct_stmt = tir::Evaluate( + std::stringstream ss; + ss << "tvm_value_" << tvm_value_index_++; + auto sid_array = tir::Var(ss.str(), DataType::Handle()); + tvm_values.push_back(sid_array); + + new_stmts.push_back(tir::Evaluate( tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(), - {sid_array, 0, tir::builtin::kArrData, call->args[i]})); - new_stmts.push_back(LetStmt(sid_array, StackAlloca("array", 1), set_struct_stmt)); + {sid_array, 0, tir::builtin::kArrData, call->args[i]}))); packed_args.push_back(sid_array); } } - // Finally, evaluate the packed call and return a sequential statement + // Evaluate the packed call new_stmts.push_back(tir::Evaluate(tir::Call(call->dtype, call->op, packed_args))); - return tir::SeqStmt(new_stmts); + tir::Stmt call_stmt = tir::SeqStmt(new_stmts); + + // Allocate the TVMValues on the stack and define the variables + for (auto v : tvm_values) { + call_stmt = LetStmt(v, StackAlloca("array", 1), call_stmt); + } + return call_stmt; } } return StmtExprMutator::VisitStmt_(op); } private: - InputMap inputs_; // Store the inputs to the primfunc that don't need to be packed. + InputMap inputs_; // Store the inputs to the primfunc that don't need to be packed. + int tvm_value_index_; // Index of the actual tvm_value variable }; namespace transform { @@ -102,6 +113,8 @@ Pass LegalizePackedCalls() { }; return CreatePrimFuncPass(pass_func, 0, "tir.LegalizePackedCalls", {}); } + +TVM_REGISTER_GLOBAL("tir.transform.LegalizePackedCalls").set_body_typed(LegalizePackedCalls); } // namespace transform } // namespace tir diff --git a/tests/python/unittest/test_aot_legalize_packed_call.py b/tests/python/unittest/test_aot_legalize_packed_call.py new file mode 100644 index 000000000000..626af0c96633 --- /dev/null +++ b/tests/python/unittest/test_aot_legalize_packed_call.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-function-docstring,missing-module-docstring +import tvm +from tvm.script import ty +from tvm import te, tir +import numpy as np +import tvm.testing +import pytest + + +@tvm.script.tir +class Module: + def tir_packed_call() -> None: + A = tir.var("handle") + B = tir.var("handle") + C = tir.var("handle") + # body + tir.evaluate( + tir.tvm_call_cpacked( + "tvm_test_cpacked", + A, + B, + C, + dtype="int32", + ) + ) + + +@tvm.script.tir +class Expected: + def tir_packed_call() -> None: + A = tir.var("handle") + B = tir.var("handle") + C = tir.var("handle") + + # body + tvm_value_2 = tir.var("handle") + tvm_value_1 = tir.var("handle") + tvm_value_0 = tir.var("handle") + with tir.let(tvm_value_2, tir.tvm_stack_alloca("array", 1, dtype="handle")): + with tir.let(tvm_value_1, tir.tvm_stack_alloca("array", 1, dtype="handle")): + with tir.let(tvm_value_0, tir.tvm_stack_alloca("array", 1, dtype="handle")): + tir.evaluate(tir.tvm_struct_set(tvm_value_0, 0, 1, A, dtype="handle")) + tir.evaluate(tir.tvm_struct_set(tvm_value_1, 0, 1, B, dtype="handle")) + tir.evaluate(tir.tvm_struct_set(tvm_value_2, 0, 1, C, dtype="handle")) + tir.evaluate( + tir.tvm_call_cpacked( + "tvm_test_cpacked", + tvm_value_0, + tvm_value_1, + tvm_value_2, + dtype="int32", + ) + ) + + +def test_aot_packed_call(): + mod = Module() + expected = Expected() + out = tir.transform.LegalizePackedCalls()(mod) + tvm.ir.assert_structural_equal(expected, out, map_free_vars=True) + + +if __name__ == "__main__": + pytest.main([__file__]) From bc7ba50095e4e85638cc57b1cb71787a3244d31e Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 28 Jun 2021 18:27:58 +0100 Subject: [PATCH 6/7] rebasing Change-Id: I52ceab5cf6e9b54390cb36c18dbb8e22505d8e18 --- src/relay/backend/aot_executor_codegen.cc | 15 +++++++-------- tests/python/relay/aot/aot_test_utils.py | 5 ++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 66f3f1ecaa20..e5750fd58d42 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -49,7 +49,7 @@ namespace backend { * Struct to contain information about the intermediate tensors in the * runner function */ -struct StorageInfo { +struct TempStorageInfo { /*! \brief storage integer identifier of the particular intermediate buffer */ int sid; /*! \brief exact size of the temporary */ @@ -60,7 +60,7 @@ struct StorageInfo { using IntegerArray = Array; using TargetsMap = std::unordered_map; -using StorageMap = std::unordered_map, runtime::ObjectPtrHash, +using StorageMap = std::unordered_map, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>; /** @@ -116,7 +116,7 @@ class AOTOnDemandAllocator : public ExprVisitor { } void VisitExpr_(const TupleNode* op) final { - std::vector field_sids; + std::vector field_sids; Expr expr = GetRef(op); for (Expr field : op->fields) { auto sid = GetStorage(field); @@ -179,7 +179,7 @@ class AOTOnDemandAllocator : public ExprVisitor { * \param expr The expression. * \return The corresponding token. */ - std::vector GetStorage(const Expr& expr) { + std::vector GetStorage(const Expr& expr) { this->VisitExpr(expr); auto it = storage_device_map_.find(expr); ICHECK(it != storage_device_map_.end()); @@ -191,14 +191,14 @@ class AOTOnDemandAllocator : public ExprVisitor { * \param expr The expression. */ void CreateStorage(const ExprNode* op) { - std::vector buffers; + std::vector buffers; Expr expr = GetRef(op); int device_type = node_device_map_.count(GetRef(op)) ? node_device_map_[expr]->value : 0; if (const auto* tuple_type = op->checked_type().as()) { for (Type t : tuple_type->fields) { const auto* ttype = t.as(); ICHECK(ttype); - StorageInfo buffer; + TempStorageInfo buffer; buffer.sid = next_available_sid_++; buffer.size_bytes = GetMemorySizeBytes(ttype); buffer.dev_type = device_type; @@ -207,7 +207,7 @@ class AOTOnDemandAllocator : public ExprVisitor { } else { const auto* ttype = op->checked_type().as(); ICHECK(ttype); - StorageInfo buffer; + TempStorageInfo buffer; buffer.sid = next_available_sid_++; buffer.size_bytes = GetMemorySizeBytes(ttype); buffer.dev_type = device_type; @@ -707,7 +707,6 @@ class AOTExecutorCodegen : public ExprVisitor { /*! \brief the module name we use to mangle the function names */ String mod_name_; - public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) : mod_(mod), diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 29994fb2afff..836ff4b22b20 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -37,11 +37,11 @@ from tvm.micro import export_model_library_format -<<<<<<< HEAD def mangle_name(mod_name, name): mod_name = mangle_module_name(mod_name) return mod_name + "_" + name -======= + + def convert_to_relay( tflite_model_buf, input_data, @@ -80,7 +80,6 @@ def convert_to_list(x): ) mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params) return mod, params ->>>>>>> 7d02e64f6... Fix an issue with storage-rewrite pass and packed functions def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): From 62fe73c07815c335c23b79627c10b7b81ea8882e Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 29 Jun 2021 12:15:02 +0100 Subject: [PATCH 7/7] Use common StorageInfo Change-Id: Ia8b7de1373f167ca7d0d69a99846d417405bbe48 --- src/relay/backend/aot_executor_codegen.cc | 109 ++++++++++------------ 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index e5750fd58d42..9b495adbdea8 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -45,23 +45,10 @@ namespace tvm { namespace relay { namespace backend { -/** - * Struct to contain information about the intermediate tensors in the - * runner function - */ -struct TempStorageInfo { - /*! \brief storage integer identifier of the particular intermediate buffer */ - int sid; - /*! \brief exact size of the temporary */ - int size_bytes; - /*! \brief device type of the intermediate tensor */ - int dev_type; -}; - using IntegerArray = Array; using TargetsMap = std::unordered_map; -using StorageMap = std::unordered_map, runtime::ObjectPtrHash, - runtime::ObjectPtrEqual>; +using StorageMap = + std::unordered_map; /** * This is an on demand allocator for AOT. A new temporary @@ -116,22 +103,29 @@ class AOTOnDemandAllocator : public ExprVisitor { } void VisitExpr_(const TupleNode* op) final { - std::vector field_sids; + std::vector storage_ids; + std::vector device_types; + std::vector storage_sizes_in_bytes; Expr expr = GetRef(op); for (Expr field : op->fields) { auto sid = GetStorage(field); - field_sids.insert(field_sids.end(), sid.begin(), sid.end()); + storage_ids.insert(storage_ids.end(), sid->storage_ids.begin(), sid->storage_ids.end()); + device_types.insert(device_types.end(), sid->device_types.begin(), sid->device_types.end()); + storage_sizes_in_bytes.insert(storage_sizes_in_bytes.end(), + sid->storage_sizes_in_bytes.begin(), + sid->storage_sizes_in_bytes.end()); } - - storage_device_map_[expr] = field_sids; + storage_device_map_[expr] = StorageInfo(storage_ids, device_types, storage_sizes_in_bytes); AssignReturnSid(expr); } void VisitExpr_(const TupleGetItemNode* op) final { Expr expr = GetRef(op); - const auto& sids = GetStorage(op->tuple); - ICHECK_LT(static_cast(op->index), sids.size()); - storage_device_map_[expr] = {sids[op->index]}; + auto sids = GetStorage(op->tuple); + ICHECK_LT(static_cast(op->index), sids->storage_ids.size()); + storage_device_map_[expr] = + StorageInfo({sids->storage_ids[op->index]}, {sids->device_types[op->index]}, + {sids->storage_sizes_in_bytes[op->index]}); AssignReturnSid(expr); } @@ -142,10 +136,10 @@ class AOTOnDemandAllocator : public ExprVisitor { private: void AssignReturnSid(Expr e) { if (storage_device_map_.find(e) != storage_device_map_.end()) { - auto buffers = storage_device_map_[e]; + StorageInfo& sinfo = storage_device_map_[e]; return_ids_.clear(); - for (auto buffer : buffers) { - return_ids_.push_back(buffer.sid); + for (auto sid : sinfo->storage_ids) { + return_ids_.push_back(sid); } } } @@ -179,7 +173,7 @@ class AOTOnDemandAllocator : public ExprVisitor { * \param expr The expression. * \return The corresponding token. */ - std::vector GetStorage(const Expr& expr) { + StorageInfo GetStorage(const Expr& expr) { this->VisitExpr(expr); auto it = storage_device_map_.find(expr); ICHECK(it != storage_device_map_.end()); @@ -191,29 +185,28 @@ class AOTOnDemandAllocator : public ExprVisitor { * \param expr The expression. */ void CreateStorage(const ExprNode* op) { - std::vector buffers; + std::vector storage_ids; + std::vector device_types; + std::vector storage_sizes_in_bytes; Expr expr = GetRef(op); - int device_type = node_device_map_.count(GetRef(op)) ? node_device_map_[expr]->value : 0; + int device_type_int = + node_device_map_.count(GetRef(op)) ? node_device_map_[expr]->value : 0; if (const auto* tuple_type = op->checked_type().as()) { for (Type t : tuple_type->fields) { const auto* ttype = t.as(); ICHECK(ttype); - TempStorageInfo buffer; - buffer.sid = next_available_sid_++; - buffer.size_bytes = GetMemorySizeBytes(ttype); - buffer.dev_type = device_type; - buffers.push_back(buffer); + storage_ids.push_back(next_available_sid_++); + storage_sizes_in_bytes.push_back(GetMemorySizeBytes(ttype)); + device_types.push_back(DLDeviceType(device_type_int)); } } else { const auto* ttype = op->checked_type().as(); ICHECK(ttype); - TempStorageInfo buffer; - buffer.sid = next_available_sid_++; - buffer.size_bytes = GetMemorySizeBytes(ttype); - buffer.dev_type = device_type; - buffers.push_back(buffer); + storage_ids.push_back(next_available_sid_++); + storage_sizes_in_bytes.push_back(GetMemorySizeBytes(ttype)); + device_types.push_back(DLDeviceType(device_type_int)); } - storage_device_map_[expr] = buffers; + storage_device_map_[expr] = StorageInfo(storage_ids, device_types, storage_sizes_in_bytes); } /*! \brief mapping of expression -> storageInfo*/ StorageMap storage_device_map_; @@ -253,14 +246,13 @@ class AOTExecutorCodegen : public ExprVisitor { * \brief Return a vector of variables that represents the sids for the given Relay Expr */ std::vector PackSid(Expr expr) { - auto buffers = storage_device_map_[expr]; std::vector buffer_vars; + StorageInfo& sinfo = storage_device_map_[expr]; // Note that an expression can have multiple sids associated with it // e.g., returning multiple values from a function - for (const auto& buffer : buffers) { + for (auto sid : sinfo->storage_ids) { // Determine if an sid is an output buffer - int sid = buffer.sid; auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); @@ -491,7 +483,8 @@ class AOTExecutorCodegen : public ExprVisitor { } ICHECK_GE(storage_device_map_.count(expr), 0); - auto call_dev_type = storage_device_map_[expr][0].dev_type; + StorageInfo& sinfo = storage_device_map_[expr]; + auto call_dev_type = sinfo->device_types[0]; // Normal Relay Function if (targets_.size() == 1) { // homogeneous execution. @@ -525,23 +518,22 @@ class AOTExecutorCodegen : public ExprVisitor { void VisitExpr_(const VarNode* op) override { Expr expr = GetRef(op); + StorageInfo& sinfo = storage_device_map_[expr]; // If the Var node is an output node we need to copy the content of the variable to the output // It's safe to check the SID here because Var StorageToken are never reallocated - auto buffers = storage_device_map_[expr]; - - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sinfo->storage_ids[0]); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); if (params_by_expr_.find(expr) != params_by_expr_.end()) { auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, - /*pack_input*/ true, buffers[0].size_bytes); + /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); } else { auto var_expr = FindExpr(expr); CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], - /*pack_input*/ true, buffers[0].size_bytes); + /*pack_input*/ true, sinfo->storage_sizes_in_bytes[0]); } } } @@ -550,21 +542,20 @@ class AOTExecutorCodegen : public ExprVisitor { Expr expr = GetRef(op); size_t index = params_.size(); std::string name = "p" + std::to_string(index); - - param_storage_ids_[name] = storage_device_map_[expr][0].sid; + StorageInfo& sinfo = storage_device_map_[expr]; + param_storage_ids_[name] = sinfo->storage_ids[0]; params_[name] = op->data; params_by_expr_.Set(expr, name); // If the Constant node is an output node we need to copy the content of the parameter to the // output A Var node can only produce a single output - auto buffers = storage_device_map_[expr]; - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), buffers[0].sid); + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sinfo->storage_ids[0]); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), {tir::StringImm(params_by_expr_[expr])}); CopyToOutput(main_signature_[input_vars_.size() + output_index], param_handle, false, - buffers[0].size_bytes); + sinfo->storage_sizes_in_bytes[0]); } } @@ -622,9 +613,9 @@ class AOTExecutorCodegen : public ExprVisitor { continue; } - for (unsigned int i = 0; i < kv.second.size(); i++) { - int size = kv.second[i].size_bytes; - int sid = kv.second[i].sid; + for (unsigned int i = 0; i < kv.second->storage_ids.size(); i++) { + int size = kv.second->storage_sizes_in_bytes[i]; + int sid = kv.second->storage_ids[i]; if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) { continue; @@ -730,9 +721,9 @@ class AOTExecutorCodegen : public ExprVisitor { // Define the storage allocator ids for (auto kv : storage_device_map_) { - for (const auto& buffer : kv.second) { - te::Var buffer_var(MakeString("sid_", buffer.sid), PointerType(PrimType(DataType::Int(8)))); - sids_table_[buffer.sid] = buffer_var; + for (auto sid : kv.second->storage_ids) { + te::Var buffer_var(MakeString("sid_", sid), PointerType(PrimType(DataType::Int(8)))); + sids_table_[sid] = buffer_var; } }