From a69c1a444eab9510bda1fac564e5f97fd335521a Mon Sep 17 00:00:00 2001
From: Chris Sullivan
Date: Tue, 2 Mar 2021 14:34:55 -0800
Subject: [PATCH] Update memory planner to optimize 2d memory allocations and
 add downstream support in graph runtime storage setup.

---
 src/relay/backend/graph_plan_memory.cc | 342 +++++++++++++++++--------
 src/runtime/graph/graph_runtime.cc     |  57 ++++-
 src/runtime/graph/graph_runtime.h      |   3 +-
 3 files changed, 279 insertions(+), 123 deletions(-)

diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 15173c2c79db..2690fa847148 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -28,6 +28,7 @@
 #include

 #include "../../support/arena.h"
+#include "../../runtime/texture.h"

 namespace tvm {
 namespace relay {
@@ -184,17 +185,6 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor {

 class StorageAllocator : public StorageAllocaBaseVisitor {
  public:
-  /*!
-   * \return totoal number of bytes allocated
-   */
-  size_t TotalAllocBytes() const {
-    size_t total = 0;
-    for (const auto* p : data_) {
-      total += p->max_bytes;
-    }
-    return total;
-  }
-
   // Run storage allocation for a function.
   Map<Expr, Array<IntegerArray>> Plan(const Function& func) {
     prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
@@ -238,10 +228,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     std::vector<StorageToken*> tokens;
     for (StorageToken* tok : it->second) {
       if (can_realloc) {
-        tokens.push_back(Request(tok));
+        tokens.push_back(allocator_.Request(tok));
       } else {
         // Allocate a new token,
-        StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+        StorageToken* allocated_tok = allocator_.Alloc(tok);
         allocated_tok->device_type = tok->device_type;
         // ensure it never get de-allocated.
         allocated_tok->ref_counter += 1;
@@ -263,116 +253,252 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     CreateToken(op, true);
     // check if there is orphaned output that can be released immediately.
     for (StorageToken* tok : token_map_.at(op)) {
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
     for (StorageToken* tok : args) {
       tok->ref_counter -= 1;
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
   }
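
Note on the hunks above: StorageAllocator decrements each argument token's ref_counter once its consumer has been planned, and hands fully dead tokens back to the allocator for reuse. A minimal standalone sketch of that lifecycle, using toy stand-ins rather than TVM's StorageToken:

    #include <cassert>
    #include <vector>

    // Toy stand-ins for StorageToken and the allocator's free pool.
    struct Token {
      int storage_id{-1};
      int ref_counter{0};
    };

    struct FreePool {
      std::vector<Token*> free;
      // A token becomes reusable only once every consumer has released it.
      void CheckForRelease(Token* t) {
        assert(t->ref_counter >= 0);
        if (t->ref_counter == 0) free.push_back(t);
      }
    };

    int main() {
      Token t{0, 2};  // storage id 0, two downstream consumers
      FreePool pool;
      t.ref_counter -= 1;        // first consumer planned
      pool.CheckForRelease(&t);  // still live
      t.ref_counter -= 1;        // second consumer planned
      pool.CheckForRelease(&t);  // now returned to the pool
      assert(pool.free.size() == 1);
      return 0;
    }
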
-  /*!
-   * \brief ceil(size/word_size) to get number of words.
-   * \param size The original size.
-   * \param word_size The element size.
-   */
-  static size_t DivRoundUp(size_t size, size_t word_size) {
-    return (size + word_size - 1) / word_size;
-  }
-  /*!
-   * \brief Get the memory requirement.
-   * \param prototype The prototype token.
-   * \return The required memory size.
-   */
-  size_t GetMemorySize(StorageToken* prototype) {
-    const TensorTypeNode* ttype = prototype->ttype;
-    ICHECK(ttype != nullptr);
-    size_t size = 1;
-    for (IndexExpr dim : ttype->shape) {
-      const int64_t* pval = tir::as_const_int(dim);
-      ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
-      ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
-      size *= static_cast<size_t>(pval[0]);
+
+ private:
+  class TokenAllocator1D {
+   public:
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      // calculate the size
+      size_t size = GetMemorySize(prototype);
+      // search memory block in [size / match_range_, size * match_range_)
+      if (match_range_ == 0) {
+        return nullptr;
+      }
+      auto begin = free_.lower_bound(size / match_range_);
+      auto mid = free_.lower_bound(size);
+      auto end = free_.upper_bound(size * match_range_);
+      // search for memory blocks larger than requested
+      for (auto it = mid; it != end; ++it) {
+        StorageToken* tok = it->second;
+        if (tok->device_type != prototype->device_type) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // found an exact match, erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      // then search for memory blocks smaller than requested space
+      for (auto it = mid; it != begin;) {
+        --it;
+        StorageToken* tok = it->second;
+        if (tok->device_type != prototype->device_type) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      return nullptr;
     }
-    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
-    return size;
-  }
-  /*!
-   * \brief Request a storage token for a given prototype.
-   * \param prototype. The prototype storage token.
-   * \return The result token.
-   */
-  StorageToken* Request(StorageToken* prototype) {
-    // calculate the size;
-    size_t size = GetMemorySize(prototype);
-    // search memory block in [size / match_range_, size * match_range_)
-    if (match_range_ == 0) {
-      return this->Alloc(prototype, size);
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The id to be assigned to the allocated token.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      size_t size = GetMemorySize(prototype);
+      prototype->max_bytes = size;
+      prototype->storage_id = storage_id;
+      data_.push_back(prototype);
+      return prototype;
     }
-    auto begin = free_.lower_bound(size / match_range_);
-    auto mid = free_.lower_bound(size);
-    auto end = free_.upper_bound(size * match_range_);
-    // search for memory blocks larger than requested
-    for (auto it = mid; it != end; ++it) {
-      StorageToken* tok = it->second;
-      if (tok->device_type != prototype->device_type) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // find a exact match, erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_.insert({tok->max_bytes, tok});
+      }
     }
-    // then search for memory blocks smaller than requested space
-    for (auto it = mid; it != begin;) {
-      --it;
-      StorageToken* tok = it->second;
-      if (tok->device_type != prototype->device_type) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \return total number of bytes allocated
+     */
+    size_t TotalAllocBytes() const {
+      size_t total = 0;
+      for (const auto* p : data_) {
+        total += p->max_bytes;
+      }
+      return total;
     }
-    // cannot find anything return a new one.
-    return this->Alloc(prototype, size);
-  }
-  /*!
-   * \brief Allocate a storage token by consuming prototype
-   * \param prototype The prototype token.
-   * \param size The size of memory being requested.
-   */
-  StorageToken* Alloc(StorageToken* prototype, size_t size) {
-    prototype->max_bytes = size;
-    prototype->storage_id = static_cast<int64_t>(data_.size());
-    data_.push_back(prototype);
-    return prototype;
-  }
-  /*!
-   * \brief Check if we can release token.
-   * \param tok The token to be released.
-   */
-  void CheckForRelease(StorageToken* tok) {
-    ICHECK_GE(tok->storage_id, 0);
-    ICHECK_GE(tok->ref_counter, 0);
-    if (tok->ref_counter == 0) {
-      free_.insert({tok->max_bytes, tok});
+    /*!
+     * \brief ceil(size/word_size) to get number of words.
+     * \param size The original size.
+     * \param word_size The element size.
+     */
+    static size_t DivRoundUp(size_t size, size_t word_size) {
+      return (size + word_size - 1) / word_size;
     }
-  }
+    /*!
+     * \brief Get the memory requirement.
+     * \param prototype The prototype token.
+     * \return The required memory size.
+     */
+    size_t GetMemorySize(StorageToken* prototype) {
+      const TensorTypeNode* ttype = prototype->ttype;
+      ICHECK(ttype != nullptr);
+      size_t size = 1;
+      for (IndexExpr dim : ttype->shape) {
+        const int64_t* pval = tir::as_const_int(dim);
+        ICHECK(pval != nullptr) << "Cannot allocate memory for symbolic tensor shape " << ttype->shape;
+        ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval;
+        size *= static_cast<size_t>(pval[0]);
+      }
+      size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+      return size;
+    }
+   private:
+    // scale used for rough match
+    const size_t match_range_{16};
+    // free list of storage entry
+    std::multimap<size_t, StorageToken*> free_;
+    // all the storage resources available
+    std::vector<StorageToken*> data_;
+  };
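
TokenAllocator1D keeps its free list keyed by max_bytes and only considers blocks whose size falls within [size / match_range_, size * match_range_), preferring a block that already fits before growing a smaller one. A self-contained sketch of that two-phase search over a std::multimap, with toy integer ids standing in for StorageToken pointers and match_range = 16 as above:

    #include <cstdio>
    #include <map>

    int main() {
      // free list: block size in bytes -> storage id
      std::multimap<size_t, int> free_list{{64, 0}, {512, 1}, {100000, 2}};
      const size_t match_range = 16;
      const size_t size = 400;  // requested bytes

      auto begin = free_list.lower_bound(size / match_range);  // blocks >= 25 B
      auto mid = free_list.lower_bound(size);                  // first block >= 400 B
      auto end = free_list.upper_bound(size * match_range);    // blocks <= 6400 B

      // Phase 1: the smallest block that already fits wins; here id=1
      // (512 B). The 100000 B block lies outside the match range and is
      // never considered, which keeps reuse from hoarding huge buffers.
      for (auto it = mid; it != end; ++it) {
        std::printf("reuse block id=%d (%zu bytes)\n", it->second, it->first);
        free_list.erase(it);
        return 0;
      }
      // Phase 2: otherwise grow the largest close-enough smaller block.
      for (auto it = mid; it != begin;) {
        --it;
        std::printf("grow block id=%d from %zu to %zu bytes\n", it->second, it->first, size);
        free_list.erase(it);
        return 0;
      }
      std::printf("no candidate; a fresh allocation is needed\n");
      return 0;
    }
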
+
+  class TokenAllocator2D {
+   public:
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      auto shape = GetSize2D(prototype);
+      int64_t requested_size = shape.height * shape.width;
+      int64_t min_added_size = std::numeric_limits<int64_t>::max();
+      int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+      int64_t best_storage_id = -1;
+      MemBlock best_mem, new_mem;
+      for (int64_t free_id : free_list_) {
+        MemBlock& cached = blocks_[free_id];
+        // Can only reuse texture 2d blocks of the same type
+        if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+          continue;
+        }
+        int64_t cached_size = cached.x_ * cached.y_;
+        new_mem.x_ = std::max(cached.x_, shape.width);
+        new_mem.y_ = std::max(cached.y_, shape.height);
+        int64_t expanded_size = new_mem.x_ * new_mem.y_;
+        int64_t added_size = expanded_size - cached_size;
+        int64_t wasted_size = expanded_size - requested_size;
+        // Prioritize minimization of added size first, then minimize
+        // wasted size among blocks which would not require expansion
+        if ((min_added_size > 0 && added_size < min_added_size) ||
+            (min_added_size == 0 && wasted_size < min_wasted_size)) {
+          min_added_size = added_size;
+          min_wasted_size = wasted_size;
+          best_storage_id = free_id;
+          best_mem = new_mem;
+        }
+      }
+
+      if (min_added_size <= requested_size) {
+        best_mem.token_ = blocks_[best_storage_id].token_;
+        // Reset the reference counter of the now live token
+        best_mem.token_->ref_counter = prototype->ref_counter;
+        blocks_[best_storage_id] = best_mem;
+        free_list_.erase(best_storage_id);
+        return best_mem.token_;
+      }
+      return nullptr;
+    }
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The id to be assigned to the allocated token.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      auto shape = GetSize2D(prototype);
+      MemBlock block;
+      block.x_ = shape.width;
+      block.y_ = shape.height;
+      prototype->storage_id = storage_id;
+      block.token_ = prototype;
+      blocks_[prototype->storage_id] = block;
+      return prototype;
+    }
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_list_.insert(tok->storage_id);
+      }
+    }
+    /*!
+     * \brief Get the texture 2d size requirement
+     * \param prototype The prototype token.
+     * \return The required texture 2d memory size in (width, height, channel).
+     */
+    Texture2DShape<int64_t> GetSize2D(StorageToken* prototype) {
+      const TensorTypeNode* ttype = prototype->ttype;
+      ICHECK(ttype != nullptr);
+      size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(), prototype->storage_scope);
+      struct Shape {
+        const Array<PrimExpr>& shape;
+        int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
+      };
+      return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
+    }
+
+   private:
+    struct MemBlock {
+      StorageToken* token_;
+      int64_t x_;
+      int64_t y_;
+    };
+
+    std::unordered_map<int64_t, MemBlock> blocks_;
+    std::unordered_set<int64_t> free_list_;
+  };
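
The 2d Request above grows a cached texture just enough to cover the request (the new extents are per-axis maxima) and scores candidates by added area first, then by wasted area among blocks that need no growth. A standalone numeric sketch of that scoring, with invented extents:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    struct Block { int64_t x, y; };  // cached texture extents

    int main() {
      std::vector<Block> cached{{16, 16}, {64, 8}, {128, 128}};
      const int64_t w = 32, h = 8;  // requested extents
      const int64_t requested = w * h;
      int64_t min_added = std::numeric_limits<int64_t>::max();
      int64_t min_wasted = std::numeric_limits<int64_t>::max();
      int best = -1;
      for (int i = 0; i < static_cast<int>(cached.size()); ++i) {
        int64_t nx = std::max(cached[i].x, w);
        int64_t ny = std::max(cached[i].y, h);
        int64_t added = nx * ny - cached[i].x * cached[i].y;
        int64_t wasted = nx * ny - requested;
        // Expansion cost dominates; waste breaks ties once a
        // zero-growth candidate has been found.
        if ((min_added > 0 && added < min_added) ||
            (min_added == 0 && wasted < min_wasted)) {
          min_added = added;
          min_wasted = wasted;
          best = i;
        }
      }
      // {16,16} must grow by 256 texels; {64,8} fits as-is and wastes less
      // than {128,128}, so block 1 wins with added=0, wasted=256.
      std::printf("best=%d added=%lld wasted=%lld\n", best,
                  static_cast<long long>(min_added), static_cast<long long>(min_wasted));
      return 0;
    }
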
+
+  class TokenAllocator {
+   public:
+    StorageToken* Alloc(StorageToken* proto) {
+      return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++) : token_1d_.Alloc(proto, storage_ids_++);
+    }
+    StorageToken* Request(StorageToken* proto) {
+      StorageToken* token = Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
+      return token ? token : this->Alloc(proto);
+    }
+    void CheckForRelease(StorageToken* tok) {
+      return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
+    }
+    static bool Is2DStorage(StorageToken* tok) { return relay::Is2DStorage(tok->storage_scope); }
+
+   private:
+    int64_t storage_ids_{0};
+    TokenAllocator1D token_1d_;
+    TokenAllocator2D token_2d_;
+  };

- private:
   // allocator
   support::Arena arena_;
-  // scale used for rough match
-  size_t match_range_{16};
-  // free list of storage entry
-  std::multimap<size_t, StorageToken*> free_;
-  // all the storage resources available
-  std::vector<StorageToken*> data_;
   /*! \brief internal prototype token map */
   std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
+  /*! \brief token allocator for optimizing 1d and 2d token alloc requests */
+  TokenAllocator allocator_;
 };

 Map<Expr, Array<IntegerArray>> GraphPlanMemory(const Function& func) {
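
The wrapper dispatches purely on the token's storage scope: relay::Is2DStorage during planning, and IsTextureStorage (aliased as details::Is2DStorage in the runtime changes below). A toy sketch of that dispatch, assuming the scope-string convention the texture runtime uses (a "texture" tag such as "global.texture"); the authoritative predicate lives in runtime/texture.h:

    #include <cstdio>
    #include <string>

    // Assumed convention: any scope carrying a "texture" tag is planned
    // against the 2d allocator; plain "global" stays in the 1d byte pool.
    bool Is2DStorage(const std::string& scope) {
      return scope.find("texture") != std::string::npos;
    }

    int main() {
      for (const char* scope : {"global", "global.texture", "global.texture-weight"}) {
        std::printf("%-22s -> %s allocator\n", scope, Is2DStorage(scope) ? "2d" : "1d");
      }
      return 0;
    }
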
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 6c51e711aef1..e50d615f147b 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -39,6 +39,7 @@
 #include

 #include "../file_utils.h"
+#include "../texture.h"

 namespace tvm {
 namespace runtime {
@@ -48,6 +49,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
   if (align < kAllocAlignment) return kAllocAlignment;
   return align;
 }
+constexpr auto Is2DStorage = IsTextureStorage;
 }  // namespace details

 /*!
@@ -279,24 +281,18 @@ void GraphRuntime::SetupStorage() {
   // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    ICHECK_GE(storage_id, 0) << "Do not support runtime shape op";
+    std::string storage_scope = attrs_.storage_scope[i];
     // Use the fallback device if no device index is available.
     int device_type = static_cast<int>(ctxs_[0].device_type);
     if (!attrs_.device_index.empty()) {
       device_type = attrs_.device_index[i];
     }
-    size_t size = 1;
-    for (int64_t sz : attrs_.shape[i]) {
-      size *= static_cast<size_t>(sz);
-    }
-    ICHECK_GE(storage_id, 0) << "Do not support runtime shape op";
-    DLDataType t = vtype[i];
-    size_t bits = t.bits * t.lanes;
-    ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
-    size_t bytes = ((bits + 7U) / 8U) * size;
     uint32_t sid = static_cast<uint32_t>(storage_id);
+
     if (sid >= pool_entry.size()) {
-      pool_entry.resize(sid + 1, {0, -1});
+      pool_entry.resize(sid + 1, {-1, {0}, {}});
     } else {
       ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type)
           << "The same pool entry cannot be assigned to multiple devices";
@@ -313,8 +309,38 @@ void GraphRuntime::SetupStorage() {
       pool_entry[sid].linked_param = lookup_rv;
     }
     pool_entry[sid].param_data_entry = i;
-    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
     pool_entry[sid].device_type = device_type;
+    pool_entry[sid].scope = storage_scope;
+
+    DLDataType t = vtype[i];
+    if (!details::Is2DStorage(storage_scope)) {
+      size_t size = 1;
+      for (int64_t sz : attrs_.shape[i]) {
+        size *= static_cast<size_t>(sz);
+      }
+      size_t bits = t.bits * t.lanes;
+      ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
+      int64_t bytes = ((bits + 7U) / 8U) * size;
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes);
+      pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1};
+    } else {
+      if (pool_entry[sid].shape.size() == 1) {
+        pool_entry[sid].shape.resize(3, 0);
+      }
+      size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope);
+      auto shape = ApplyTexture2DFlattening<int64_t>(attrs_.shape[i], attrs_.shape[i].size(), axis);
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height);
+      pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width);
+      CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel)
+          << pool_entry[sid].shape[2] << " != " << shape.channel
+          << ", texture channel length must be consistent within a storage pool";
+      pool_entry[sid].shape[2] = shape.channel;
+      CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t))
+          << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t)
+          << ", pool entry for 2d texture allocations must be of the same type;"
+          << " downstream error from memory planner likely";
+      pool_entry[sid].dtype = t;
+    }
   }

   // Allocate the space.
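
The else-branch above folds an N-d tensor into 2d texture extents before taking per-pool maxima. A standalone sketch of the assumed flattening convention (axes before the separator collapse into the height, the remaining outer axes into the width, and the innermost axis maps to the texel channels); the real logic is DefaultTextureLayoutSeparator plus ApplyTexture2DFlattening:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Tex2D { int64_t width, height, channel; };

    Tex2D Flatten(const std::vector<int64_t>& shape, size_t axis) {
      Tex2D t{1, 1, shape.back()};
      for (size_t i = 0; i < axis; ++i) t.height *= shape[i];
      for (size_t i = axis; i + 1 < shape.size(); ++i) t.width *= shape[i];
      return t;
    }

    int main() {
      // e.g. an NCHW4c activation [1, 32, 56, 56, 4] split at axis 2:
      Tex2D t = Flatten({1, 32, 56, 56, 4}, 2);
      // height = 1*32 = 32, width = 56*56 = 3136, channel = 4 (RGBA)
      std::printf("height=%lld width=%lld channel=%lld\n",
                  static_cast<long long>(t.height), static_cast<long long>(t.width),
                  static_cast<long long>(t.channel));
      return 0;
    }
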
@@ -328,9 +354,12 @@ void GraphRuntime::SetupStorage() {
     if (pit.linked_param.defined()) {
       storage_pool_.push_back(pit.linked_param);
     } else {
-      std::vector<int64_t> shape;
-      shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
-      storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
+      std::vector<int64_t> shape = pit.shape;
+      if (shape.size() == 1) {
+        shape[0] = (shape[0] + 3) / 4;
+      }
+      Optional<String> mem_scope = String(pit.scope);
+      storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, ctx, mem_scope));
     }
   }

diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index a1e2ee3b5d74..c7e5871dbc17 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -180,8 +180,9 @@ class TVM_DLL GraphRuntime : public ModuleNode {
  protected:
   // Memory pool entry.
   struct PoolEntry {
-    size_t size;
     int device_type;
+    std::vector<int64_t> shape;
+    DLDataType dtype;
     int param_data_entry;
     NDArray linked_param;
     // PoolEntry(int s, int dev_type, void* pre_linked_param) :
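
With the PoolEntry changes above, SetupStorage realizes each pool either as a flat pool of 4-byte float32 words (rank-1 shape, sized in bytes) or as a (height, width, channel) texture with its planned dtype and scope. A minimal sketch of that final step, assuming the four-argument NDArray::Empty overload with an optional memory scope that this patch relies on (the helper name is hypothetical):

    #include <cstdint>
    #include <string>
    #include <vector>

    #include <tvm/runtime/container.h>
    #include <tvm/runtime/ndarray.h>

    using tvm::runtime::NDArray;
    using tvm::runtime::String;

    NDArray AllocPoolEntry(std::vector<int64_t> shape, DLDataType dtype,
                           TVMContext ctx, const std::string& scope) {
      if (shape.size() == 1) {
        // 1d pools are tracked in bytes; repack into 4-byte float32 words.
        shape[0] = (shape[0] + 3) / 4;
        return NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx);
      }
      // 2d pools keep (height, width, channel); the memory scope routes the
      // allocation to a texture-capable device API (e.g. OpenCL images).
      return NDArray::Empty(shape, dtype, ctx, String(scope));
    }
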