diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 811e205fb2b3..fefc2857230d 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -564,6 +564,10 @@ def device_types(self):
     def storage_sizes(self):
         return _ffi_api.StorageInfoStorageSizes(self)
 
+    @property
+    def virtual_devices(self):
+        return _ffi_api.StorageInfoVirtualDevices(self)
+
 
 @tvm._ffi.register_object("relay.StaticMemoryPlan")
 class StaticMemoryPlan(Node):
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 0019b22f1a8f..dab951b7e91f 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -31,6 +31,7 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/tir/op.h>
 
+#include "../../runtime/texture.h"
 #include "../../support/arena.h"
 #include "../op/annotation/annotation.h"
 #include "../op/call/call.h"
@@ -41,6 +42,10 @@ namespace tvm {
 namespace relay {
 
+using TargetsMap = Map<Integer, Target>;
+using Texture2DShape = runtime::Texture2DShape<int64_t>;
+constexpr auto Is2DStorage = runtime::IsTextureStorage;
+
 using backend::StaticMemoryPlan;
 using backend::StorageInfo;
 using IntegerArray = Array<Integer>;
 
@@ -151,12 +156,13 @@ class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor {
    */
   const std::vector<StorageToken*>& GetToken(const Expr& expr) {
     this->VisitExpr(expr);
+    // See through on_device calls.
+    Expr real_expr = IgnoreOnDevice(expr);
+
     // Functions don't require data storage, represented by the empty token
-    if (expr->checked_type().as<FuncTypeNode>()) {
+    if (real_expr->checked_type().as<FuncTypeNode>()) {
       return no_tokens_;
     }
-    // See through on_device calls.
-    Expr real_expr = IgnoreOnDevice(expr);
     this->VisitExpr(real_expr);
     auto it = token_map_.find(real_expr.get());
     ICHECK(it != token_map_.end()) << "Expression not found in storage map:" << std::endl
@@ -225,6 +231,7 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor {
  private:
   // allocator
   support::Arena* arena_;
+  Map<Expr, Array<IntegerArray>> node_storage_map_;
 };
 
 /*! \brief Associate storage with every expression, reusing storage where possible. */
@@ -272,7 +279,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
       num_nodes++;
       storage_ids.push_back(tok->storage_id);
       virtual_devices.push_back(tok->virtual_device);
-      sid_sizes_byte.push_back(GetMemorySize(tok));
+      sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
     }
     auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(virtual_devices),
                                              std::move(sid_sizes_byte));
@@ -301,10 +308,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     for (StorageToken* tok : it->second) {
       ICHECK(tok->virtual_device == virtual_device);
       if (can_realloc) {
-        tokens.push_back(Request(tok));
+        tokens.push_back(allocator_.Request(tok));
       } else {
         // Allocate a new token,
-        StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+        StorageToken* allocated_tok = allocator_.Alloc(tok);
         allocated_tok->virtual_device = tok->virtual_device;
         // ensure it never get de-allocated.
         allocated_tok->ref_counter += 1;
@@ -365,108 +372,260 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     // check if there is orphaned output that can be released immediately.
     for (StorageToken* tok : token_map_.at(call_node)) {
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
     for (StorageToken* tok : args) {
       tok->ref_counter -= 1;
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
   }
-  /*!
-   * \brief ceil(size/word_size) to get number of words.
-   * \param size The original size.
-   * \param word_size The element size.
-   */
-  static int64_t DivRoundUp(int64_t size, int64_t word_size) {
-    return (size + word_size - 1) / word_size;
-  }
-  /*!
-   * \brief Get the memory requirement.
-   * \param prototype The prototype token.
-   * \return The required memory size.
-   *
-   * TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc,
-   * CalculateRelayExprSizeBytes in utils.cc
+  /**
+   * @brief Memory manager for flattened 1d memory (buffers)
    */
-  static int64_t GetMemorySize(StorageToken* prototype) {
-    TensorType ttype = prototype->ttype;
-    ICHECK(ttype.defined());
-    int64_t size = 1;
-    for (IndexExpr dim : ttype->shape) {
-      const int64_t* pval = tir::as_const_int(dim);
-      ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
-      ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
-      size *= pval[0];
+  class TokenAllocator1D {
+   public:
+    /*!
+     * \brief ceil(size/word_size) to get number of words.
+     * \param size The original size.
+     * \param word_size The element size.
+     */
+    static size_t DivRoundUp(size_t size, size_t word_size) {
+      return (size + word_size - 1) / word_size;
    }
-    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
-    return size;
-  }
-  /*!
-   * \brief Request a storage token for a given prototype.
-   * \param prototype. The prototype storage token.
-   * \return The result token.
-   */
-  StorageToken* Request(StorageToken* prototype) {
-    // calculate the size;
-    size_t size = GetMemorySize(prototype);
-    // search memory block in [size / match_range_, size * match_range_)
-    if (match_range_ == 0) {
-      return this->Alloc(prototype, size);
+
+    /*!
+     * \brief Get the memory requirement.
+     * \param prototype The prototype token.
+     * \return The required memory size.
+     *
+     * TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc,
+     * CalculateRelayExprSizeBytes in utils.cc
+     */
+    size_t GetMemorySize(StorageToken* prototype) {
+      TensorType ttype = prototype->ttype;
+      ICHECK(ttype.defined());
+      size_t size = 1;
+      for (IndexExpr dim : ttype->shape) {
+        const int64_t* pval = tir::as_const_int(dim);
+        ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
+        ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
+        size *= static_cast<size_t>(pval[0]);
+      }
+      size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+      return size;
    }
-    auto begin = free_.lower_bound(size / match_range_);
-    auto mid = free_.lower_bound(size);
-    auto end = free_.upper_bound(size * match_range_);
-    // search for memory blocks larger than requested
-    for (auto it = mid; it != end; ++it) {
-      StorageToken* tok = it->second;
-      if (!tok->is_compatible(*prototype)) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // find a exact match, erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      // calculate the size
+      size_t size = GetMemorySize(prototype);
+      // search memory block in [size / match_range_, size * match_range_)
+      if (match_range_ == 0) {
+        return nullptr;
+      }
+      auto begin = free_.lower_bound(size / match_range_);
+      auto mid = free_.lower_bound(size);
+      auto end = free_.upper_bound(size * match_range_);
+      // search for memory blocks larger than requested
+      for (auto it = mid; it != end; ++it) {
+        StorageToken* tok = it->second;
+        if (!tok->is_compatible(*prototype)) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // found an exact match, erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      // then search for memory blocks smaller than requested space
+      for (auto it = mid; it != begin;) {
+        --it;
+        StorageToken* tok = it->second;
+        if (!tok->is_compatible(*prototype)) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      return nullptr;
    }
-    // then search for memory blocks smaller than requested space
-    for (auto it = mid; it != begin;) {
-      --it;
-      StorageToken* tok = it->second;
-      if (!tok->is_compatible(*prototype)) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The storage id to be assigned.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      size_t size = GetMemorySize(prototype);
+      prototype->max_bytes = size;
+      prototype->storage_id = storage_id;
+      data_.push_back(prototype);
+      return prototype;
    }
-    // cannot find anything return a new one.
-    return this->Alloc(prototype, size);
-  }
-  /*!
-   * \brief Allocate a storage token by consuming prototype
-   * \param prototype The prototype token.
-   * \param size The size of memory being requested.
-   */
-  StorageToken* Alloc(StorageToken* prototype, size_t size) {
-    prototype->max_bytes = size;
-    prototype->storage_id = static_cast<int64_t>(data_.size());
-    data_.push_back(prototype);
-    return prototype;
-  }
-  /*!
-   * \brief Check if we can release token.
-   * \param tok The token to be released.
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_.insert({tok->max_bytes, tok});
+      }
+    }
+
+   private:
+    // scale used for rough match
+    const size_t match_range_{16};
+    // free list of storage entry
+    std::multimap<size_t, StorageToken*> free_;
+    // all the storage resources available
+    std::vector<StorageToken*> data_;
+  };
+
+  /**
+   * @brief Memory manager for 2d memory (textures)
    */
-  void CheckForRelease(StorageToken* tok) {
-    ICHECK_GE(tok->storage_id, 0);
-    ICHECK_GE(tok->ref_counter, 0);
-    if (tok->ref_counter == 0) {
-      free_.insert({tok->max_bytes, tok});
+  class TokenAllocator2D {
+   public:
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      auto shape = GetSize2D(prototype);
+      int64_t requested_size = shape.height * shape.width;
+      int64_t min_added_size = std::numeric_limits<int64_t>::max();
+      int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+      int64_t best_storage_id = -1;
+      MemBlock best_mem, new_mem;
+      for (int64_t free_id : free_list_) {
+        MemBlock& cached = blocks_[free_id];
+        // Can only reuse texture 2d blocks of the same type
+        if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+          continue;
+        }
+        int64_t cached_size = cached.x_ * cached.y_;
+        new_mem.x_ = std::max(cached.x_, shape.width);
+        new_mem.y_ = std::max(cached.y_, shape.height);
+        int64_t expanded_size = new_mem.x_ * new_mem.y_;
+        int64_t added_size = expanded_size - cached_size;
+        int64_t wasted_size = expanded_size - requested_size;
+        // Prioritize minimization of added size first, then minimize
+        // wasted size among blocks which would not require expansion
+        if ((min_added_size > 0 && added_size < min_added_size) ||
+            (min_added_size == 0 && wasted_size < min_wasted_size)) {
+          min_added_size = added_size;
+          min_wasted_size = wasted_size;
+          best_storage_id = free_id;
+          best_mem = new_mem;
+        }
+      }
+
+      if (min_added_size <= requested_size) {
+        best_mem.token_ = blocks_[best_storage_id].token_;
+        // Reset the reference counter of the now live token
+        best_mem.token_->ref_counter = prototype->ref_counter;
+        blocks_[best_storage_id] = best_mem;
+        free_list_.erase(best_storage_id);
+        return best_mem.token_;
+      }
+      return nullptr;
    }
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The storage id to be assigned.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      auto shape = GetSize2D(prototype);
+      MemBlock block;
+      block.x_ = shape.width;
+      block.y_ = shape.height;
+      prototype->storage_id = storage_id;
+      block.token_ = prototype;
+      blocks_[prototype->storage_id] = block;
+      return prototype;
+    }
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_list_.insert(tok->storage_id);
+      }
+    }
+    /*!
+     * \brief Get the texture 2d size requirement
+     * \param prototype The prototype token.
+     * \return The required texture 2d memory size in (width, height, channel).
+     */
+    Texture2DShape GetSize2D(StorageToken* prototype) {
+      TensorType ttype = prototype->ttype;
+      ICHECK(ttype.defined());
+      size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
+                                                           prototype->virtual_device->memory_scope);
+      struct Shape {
+        const Array<PrimExpr>& shape;
+        int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
+      };
+      return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(),
+                                                        axis);
+    }
+
+   private:
+    struct MemBlock {
+      StorageToken* token_;
+      int64_t x_;
+      int64_t y_;
+    };
+
+    std::unordered_map<int64_t, MemBlock> blocks_;
+    std::unordered_set<int64_t> free_list_;
+  };
+
+  class TokenAllocator {
+   public:
+    StorageToken* Alloc(StorageToken* proto) {
+      return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
+                                : token_1d_.Alloc(proto, storage_ids_++);
+    }
+    StorageToken* Request(StorageToken* proto) {
+      StorageToken* token =
+          Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
+      return token ? token : this->Alloc(proto);
+    }
+    void CheckForRelease(StorageToken* tok) {
+      return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
+    }
+
+    size_t GetMemorySize(StorageToken* tok) {
+      // TODO(amalyshe): figure out who requires these sizes and for what. A
+      // flat size is not meaningful for a texture: we can return any value if
+      // it is only used for memory allocation, or the real size if it is
+      // purely informational.
+      return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
+    }
+    static bool Is2DStorage(StorageToken* tok) {
+      return relay::Is2DStorage(tok->virtual_device->memory_scope);
+    }
+
+   private:
+    int64_t storage_ids_{0};
+    TokenAllocator1D token_1d_;
+    TokenAllocator2D token_2d_;
+  };
 
  private:
   // allocator
@@ -479,6 +638,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
   std::vector<StorageToken*> data_;
   /*! \brief internal prototype token map */
   std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
+  /*! \brief token allocator for optimizing 1d and 2d token alloc requests */
+  TokenAllocator allocator_;
 };
 
 StaticMemoryPlan GraphPlanMemory(const Function& func) { return StorageAllocator().Plan(func); }
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index fe8127d60dc9..340986770e93 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -114,6 +114,14 @@ TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageSizes").set_body_typed([](Storag
   return storage_sizes_in_bytes;
 });
 
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoVirtualDevices").set_body_typed([](StorageInfo si) {
+  Array<VirtualDevice> virtual_devices;
+  for (auto id : si->virtual_devices) {
+    virtual_devices.push_back(id);
+  }
+  return virtual_devices;
+});
+
 TVM_REGISTER_NODE_TYPE(StaticMemoryPlanNode);
 
 StaticMemoryPlan::StaticMemoryPlan(Map<Expr, StorageInfo> expr_to_storage_info) {
diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py
index b797e4ce9dcb..0522c0db1075 100644
--- a/tests/python/relay/test_backend_graph_executor.py
+++ b/tests/python/relay/test_backend_graph_executor.py
@@ -184,6 +184,101 @@ def test_plan_memory():
     )
 
 
+def test_plan_2d_memory():
+    """Verification that GraphPlanMemory manages 2d memory, referred to by the
+    global.texture* memory scopes."""
+    global_virtual_device = tvm.target.VirtualDevice(memory_scope="global")
+    texture_virtual_device = tvm.target.VirtualDevice(memory_scope="global.texture")
+    metatable = {
+        "VirtualDevice": [
+            global_virtual_device,
+            texture_virtual_device,
+        ]
+    }
+
+    mod = tvm.parser.parse(
+        """
+        #[version = "0.0.5"]
+        def @main(%data1: Tensor[(1, 32, 40, 40), float32],
+                  %data2: Tensor[(1, 32, 40, 40), float32]) {
+          %0 = fn (%a, Primitive=1) {
+            layout_transform(%a, src_layout="NCHW", dst_layout="NCHW4c")
+          };
+          %1 = %0(%data1);
+          %3 = %0(%data2);
+          %5 = fn (%a {virtual_device=meta[VirtualDevice][0]},  // global
+                   %b {virtual_device=meta[VirtualDevice][0]},  // global
+                   virtual_device=meta[VirtualDevice][1],  // texture
+                   Primitive=1) {
+            add(%a, %b)
+          };
+          %6 = %5(%1, %3);
+          %7 = fn (%a {virtual_device=meta[VirtualDevice][1]},  // texture
+                   %b {virtual_device=meta[VirtualDevice][0]},  // global
+                   virtual_device=meta[VirtualDevice][1],  // texture
+                   Primitive=1) {
+            add(%a, %b)
+          };
+          %8 = %7(%6, %3);
+          %9 = fn (%a {virtual_device=meta[VirtualDevice][1]},  // texture
+                   %b {virtual_device=meta[VirtualDevice][1]},  // texture
+                   virtual_device=meta[VirtualDevice][1],  // texture
+                   Primitive=1) {
+            add(%a, %b)
+          };
+          %10 = %9(%8, %6);
+          %11 = fn (%a,
+                    virtual_device=meta[VirtualDevice][0],  // global
+                    Primitive=1) {
+            layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
+          };
+          %11(%10)
+        }
+        """,
+        "from_string",
+        None,
+        metatable,
+    )
+
+    GPU_DEVICE = tvm.device("cuda")
+    HOST_TARGET = tvm.target.Target("llvm")
+    GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET)
+    GPU = tvm.target.VirtualDevice(GPU_DEVICE, GPU_TARGET)  # device_type=2
+    CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type": GPU.device_type_int})
+    config = tvm.target.make_compilation_config(CTXT, GPU_TARGET)
+    mod = relay.transform.InferType()(mod)
+    # PlanDevices should succeed.
+    mod = relay.transform.PlanDevices(config)(mod)
+
+    func = mod["main"]
+    memory_plan = relay.backend._backend.GraphPlanMemory(func)
+    virtual_devices = {}
+
+    # We have no execution-order information here; the only order we can rely
+    # on is the storage_id. For the graph above we expect:
+    # - 8 managed storages in total
+    # - 5 of them are buffers
+    # - 3 of them are textures (2d storages)
+    # - 1 buffer is reused; since the data below is keyed by storage_id, the
+    #   4th storage id is reused and hidden in the virtual_devices map
+    # - no textures are reused so far
+    for k, v in memory_plan.expr_to_storage_info.items():
+        virtual_devices[v.storage_ids[0]] = v.virtual_devices[0].memory_scope
+
+    # Check the scopes according to the above expectations
+    assert (
+        virtual_devices[0] == "global"
+        and virtual_devices[1] == "global"
+        and virtual_devices[2] == "global"
+        and virtual_devices[3] == "global"
+        and virtual_devices[4] == "global.texture"
+        and virtual_devices[5] == "global.texture"
+        and virtual_devices[6] == "global.texture"
+    )
+
+
 def test_reshape_nop():
     # test that reshape can be turned into nop
     x = relay.var("x", shape=(10, 4))
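For reference, a minimal usage sketch of the new StorageInfo.virtual_devices
property together with storage_ids. The helper summarize_memory_plan below is
illustrative only (not part of this patch), and it assumes a module that has
already been lowered with PlanDevices, as in test_plan_2d_memory above:

    # Hypothetical helper: group planned storage ids by memory scope.
    from collections import defaultdict

    from tvm import relay


    def summarize_memory_plan(func):
        """Return {memory_scope: sorted storage ids} for a device-planned function."""
        memory_plan = relay.backend._backend.GraphPlanMemory(func)
        scopes = defaultdict(set)
        for _, storage_info in memory_plan.expr_to_storage_info.items():
            for sid, device in zip(storage_info.storage_ids, storage_info.virtual_devices):
                scopes[str(device.memory_scope)].add(int(sid))
        return {scope: sorted(ids) for scope, ids in scopes.items()}


    # For the module in test_plan_2d_memory this should yield
    # {"global": [0, 1, 2, 3], "global.texture": [4, 5, 6]}.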
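The reuse rule in TokenAllocator2D::Request can also be traced with concrete
numbers. The stand-alone sketch below mirrors its added/wasted-size
bookkeeping (pure illustration, ignoring the dtype-compatibility check):

    # Pick a free 2d block for a (want_w x want_h) request: prefer the block
    # needing the least growth; among no-growth blocks, prefer the least waste.
    def pick_block(free_blocks, want_w, want_h):
        requested = want_w * want_h
        best, min_added, min_wasted = None, float("inf"), float("inf")
        for block_id, (w, h) in free_blocks.items():
            new_w, new_h = max(w, want_w), max(h, want_h)
            added = new_w * new_h - w * h
            wasted = new_w * new_h - requested
            if (min_added > 0 and added < min_added) or (
                min_added == 0 and wasted < min_wasted
            ):
                best, min_added, min_wasted = block_id, added, wasted
        # Reuse only when the required growth does not exceed the requested area.
        return best if min_added <= requested else None


    # Requesting 40x10 given free blocks {0: (64, 8), 1: (16, 16)}:
    # block 0 grows to 64x10 (added 128), block 1 to 40x16 (added 384),
    # so pick_block({0: (64, 8), 1: (16, 16)}, 40, 10) == 0.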