From a69c1a444eab9510bda1fac564e5f97fd335521a Mon Sep 17 00:00:00 2001
From: Chris Sullivan
Date: Tue, 2 Mar 2021 14:34:55 -0800
Subject: [PATCH] Update memory planner to optimize 2d memory allocations and
 add downstream support in graph runtime storage setup.

---
 src/relay/backend/graph_plan_memory.cc | 342 +++++++++++++++++--------
 src/runtime/graph/graph_runtime.cc     |  57 ++++-
 src/runtime/graph/graph_runtime.h      |   3 +-
 3 files changed, 279 insertions(+), 123 deletions(-)

diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 15173c2c79db..2690fa847148 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -28,6 +28,7 @@
 #include

 #include "../../support/arena.h"
+#include "../../runtime/texture.h"

 namespace tvm {
 namespace relay {
@@ -184,17 +185,6 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor {

 class StorageAllocator : public StorageAllocaBaseVisitor {
  public:
-  /*!
-   * \return totoal number of bytes allocated
-   */
-  size_t TotalAllocBytes() const {
-    size_t total = 0;
-    for (const auto* p : data_) {
-      total += p->max_bytes;
-    }
-    return total;
-  }
-
   // Run storage allocation for a function.
   Map<Expr, Array<IntegerArray>> Plan(const Function& func) {
     prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
@@ -238,10 +228,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     std::vector<StorageToken*> tokens;
     for (StorageToken* tok : it->second) {
       if (can_realloc) {
-        tokens.push_back(Request(tok));
+        tokens.push_back(allocator_.Request(tok));
       } else {
         // Allocate a new token,
-        StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+        StorageToken* allocated_tok = allocator_.Alloc(tok);
         allocated_tok->device_type = tok->device_type;
         // ensure it never get de-allocated.
         allocated_tok->ref_counter += 1;
@@ -263,116 +253,252 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     CreateToken(op, true);
     // check if there is orphaned output that can be released immediately.
     for (StorageToken* tok : token_map_.at(op)) {
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
     for (StorageToken* tok : args) {
       tok->ref_counter -= 1;
-      CheckForRelease(tok);
+      allocator_.CheckForRelease(tok);
     }
   }
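
Note on the hunks above: StorageAllocator decrements each argument token's ref_counter once its consumer has been planned, and hands fully dead tokens back to the allocator for reuse. A minimal standalone sketch of that lifecycle, using toy stand-ins rather than TVM's StorageToken:

    #include <cassert>
    #include <vector>

    // Toy stand-ins for StorageToken and the allocator's free pool.
    struct Token {
      int storage_id{-1};
      int ref_counter{0};
    };

    struct FreePool {
      std::vector<Token*> free;
      // A token becomes reusable only once every consumer has released it.
      void CheckForRelease(Token* t) {
        assert(t->ref_counter >= 0);
        if (t->ref_counter == 0) free.push_back(t);
      }
    };

    int main() {
      Token t{0, 2};  // storage id 0, two downstream consumers
      FreePool pool;
      t.ref_counter -= 1;        // first consumer planned
      pool.CheckForRelease(&t);  // still live
      t.ref_counter -= 1;        // second consumer planned
      pool.CheckForRelease(&t);  // now returned to the pool
      assert(pool.free.size() == 1);
      return 0;
    }
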
-  /*!
-   * \brief ceil(size/word_size) to get number of words.
-   * \param size The original size.
-   * \param word_size The element size.
-   */
-  static size_t DivRoundUp(size_t size, size_t word_size) {
-    return (size + word_size - 1) / word_size;
-  }
-  /*!
-   * \brief Get the memory requirement.
-   * \param prototype The prototype token.
-   * \return The required memory size.
-   */
-  size_t GetMemorySize(StorageToken* prototype) {
-    const TensorTypeNode* ttype = prototype->ttype;
-    ICHECK(ttype != nullptr);
-    size_t size = 1;
-    for (IndexExpr dim : ttype->shape) {
-      const int64_t* pval = tir::as_const_int(dim);
-      ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
-      ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
-      size *= static_cast<size_t>(pval[0]);
+
+ private:
+  class TokenAllocator1D {
+   public:
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      // calculate the size
+      size_t size = GetMemorySize(prototype);
+      // search memory block in [size / match_range_, size * match_range_)
+      if (match_range_ == 0) {
+        return nullptr;
+      }
+      auto begin = free_.lower_bound(size / match_range_);
+      auto mid = free_.lower_bound(size);
+      auto end = free_.upper_bound(size * match_range_);
+      // search for memory blocks larger than requested
+      for (auto it = mid; it != end; ++it) {
+        StorageToken* tok = it->second;
+        if (tok->device_type != prototype->device_type) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // found an exact match, erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      // then search for memory blocks smaller than requested space
+      for (auto it = mid; it != begin;) {
+        --it;
+        StorageToken* tok = it->second;
+        if (tok->device_type != prototype->device_type) continue;
+        ICHECK_EQ(tok->ref_counter, 0);
+        // Use exact matching strategy
+        tok->max_bytes = std::max(size, tok->max_bytes);
+        tok->ref_counter = prototype->ref_counter;
+        // erase from map and return
+        free_.erase(it);
+        return tok;
+      }
+      return nullptr;
     }
-    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
-    return size;
-  }
-  /*!
-   * \brief Request a storage token for a given prototype.
-   * \param prototype. The prototype storage token.
-   * \return The result token.
-   */
-  StorageToken* Request(StorageToken* prototype) {
-    // calculate the size;
-    size_t size = GetMemorySize(prototype);
-    // search memory block in [size / match_range_, size * match_range_)
-    if (match_range_ == 0) {
-      return this->Alloc(prototype, size);
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The id to be assigned to the allocated token.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      size_t size = GetMemorySize(prototype);
+      prototype->max_bytes = size;
+      prototype->storage_id = storage_id;
+      data_.push_back(prototype);
+      return prototype;
     }
-    auto begin = free_.lower_bound(size / match_range_);
-    auto mid = free_.lower_bound(size);
-    auto end = free_.upper_bound(size * match_range_);
-    // search for memory blocks larger than requested
-    for (auto it = mid; it != end; ++it) {
-      StorageToken* tok = it->second;
-      if (tok->device_type != prototype->device_type) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // find a exact match, erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_.insert({tok->max_bytes, tok});
+      }
     }
-    // then search for memory blocks smaller than requested space
-    for (auto it = mid; it != begin;) {
-      --it;
-      StorageToken* tok = it->second;
-      if (tok->device_type != prototype->device_type) continue;
-      ICHECK_EQ(tok->ref_counter, 0);
-      // Use exect matching strategy
-      tok->max_bytes = std::max(size, tok->max_bytes);
-      tok->ref_counter = prototype->ref_counter;
-      // erase from map and return
-      free_.erase(it);
-      return tok;
+    /*!
+     * \return total number of bytes allocated
+     */
+    size_t TotalAllocBytes() const {
+      size_t total = 0;
+      for (const auto* p : data_) {
+        total += p->max_bytes;
+      }
+      return total;
     }
-    // cannot find anything return a new one.
-    return this->Alloc(prototype, size);
-  }
-  /*!
-   * \brief Allocate a storage token by consuming prototype
-   * \param prototype The prototype token.
-   * \param size The size of memory being requested.
-   */
-  StorageToken* Alloc(StorageToken* prototype, size_t size) {
-    prototype->max_bytes = size;
-    prototype->storage_id = static_cast<int64_t>(data_.size());
-    data_.push_back(prototype);
-    return prototype;
-  }
-  /*!
-   * \brief Check if we can release token.
-   * \param tok The token to be released.
-   */
-  void CheckForRelease(StorageToken* tok) {
-    ICHECK_GE(tok->storage_id, 0);
-    ICHECK_GE(tok->ref_counter, 0);
-    if (tok->ref_counter == 0) {
-      free_.insert({tok->max_bytes, tok});
+    /*!
+     * \brief ceil(size/word_size) to get number of words.
+     * \param size The original size.
+     * \param word_size The element size.
+     */
+    static size_t DivRoundUp(size_t size, size_t word_size) {
+      return (size + word_size - 1) / word_size;
     }
-  }
+    /*!
+     * \brief Get the memory requirement.
+     * \param prototype The prototype token.
+     * \return The required memory size.
+     */
+    size_t GetMemorySize(StorageToken* prototype) {
+      const TensorTypeNode* ttype = prototype->ttype;
+      ICHECK(ttype != nullptr);
+      size_t size = 1;
+      for (IndexExpr dim : ttype->shape) {
+        const int64_t* pval = tir::as_const_int(dim);
+        ICHECK(pval != nullptr) << "Cannot allocate memory for symbolic tensor shape " << ttype->shape;
+        ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape " << *pval;
+        size *= static_cast<size_t>(pval[0]);
+      }
+      size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+      return size;
+    }
+   private:
+    // scale used for rough match
+    const size_t match_range_{16};
+    // free list of storage entry
+    std::multimap<size_t, StorageToken*> free_;
+    // all the storage resources available
+    std::vector<StorageToken*> data_;
+  };
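
TokenAllocator1D keeps its free list keyed by max_bytes and only considers blocks whose size falls within [size / match_range_, size * match_range_), preferring a block that already fits before growing a smaller one. A self-contained sketch of that two-phase search over a std::multimap, with toy integer ids standing in for StorageToken pointers and match_range = 16 as above:

    #include <cstdio>
    #include <map>

    int main() {
      // free list: block size in bytes -> storage id
      std::multimap<size_t, int> free_list{{64, 0}, {512, 1}, {100000, 2}};
      const size_t match_range = 16;
      const size_t size = 400;  // requested bytes

      auto begin = free_list.lower_bound(size / match_range);  // blocks >= 25 B
      auto mid = free_list.lower_bound(size);                  // first block >= 400 B
      auto end = free_list.upper_bound(size * match_range);    // blocks <= 6400 B

      // Phase 1: the smallest block that already fits wins; here id=1
      // (512 B). The 100000 B block lies outside the match range and is
      // never considered, which keeps reuse from hoarding huge buffers.
      for (auto it = mid; it != end; ++it) {
        std::printf("reuse block id=%d (%zu bytes)\n", it->second, it->first);
        free_list.erase(it);
        return 0;
      }
      // Phase 2: otherwise grow the largest close-enough smaller block.
      for (auto it = mid; it != begin;) {
        --it;
        std::printf("grow block id=%d from %zu to %zu bytes\n", it->second, it->first, size);
        free_list.erase(it);
        return 0;
      }
      std::printf("no candidate; a fresh allocation is needed\n");
      return 0;
    }
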
+
+  class TokenAllocator2D {
+   public:
+    /*!
+     * \brief Request a storage token for a given prototype.
+     * \param prototype The prototype storage token.
+     * \return The result token.
+     */
+    StorageToken* Request(StorageToken* prototype) {
+      auto shape = GetSize2D(prototype);
+      int64_t requested_size = shape.height * shape.width;
+      int64_t min_added_size = std::numeric_limits<int64_t>::max();
+      int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+      int64_t best_storage_id = -1;
+      MemBlock best_mem, new_mem;
+      for (int64_t free_id : free_list_) {
+        MemBlock& cached = blocks_[free_id];
+        // Can only reuse texture 2d blocks of the same type
+        if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+          continue;
+        }
+        int64_t cached_size = cached.x_ * cached.y_;
+        new_mem.x_ = std::max(cached.x_, shape.width);
+        new_mem.y_ = std::max(cached.y_, shape.height);
+        int64_t expanded_size = new_mem.x_ * new_mem.y_;
+        int64_t added_size = expanded_size - cached_size;
+        int64_t wasted_size = expanded_size - requested_size;
+        // Prioritize minimization of added size first, then minimize
+        // wasted size among blocks which would not require expansion
+        if ((min_added_size > 0 && added_size < min_added_size) ||
+            (min_added_size == 0 && wasted_size < min_wasted_size)) {
+          min_added_size = added_size;
+          min_wasted_size = wasted_size;
+          best_storage_id = free_id;
+          best_mem = new_mem;
+        }
+      }
+
+      if (min_added_size <= requested_size) {
+        best_mem.token_ = blocks_[best_storage_id].token_;
+        // Reset the reference counter of the now live token
+        best_mem.token_->ref_counter = prototype->ref_counter;
+        blocks_[best_storage_id] = best_mem;
+        free_list_.erase(best_storage_id);
+        return best_mem.token_;
+      }
+      return nullptr;
+    }
+    /*!
+     * \brief Allocate a storage token by consuming prototype
+     * \param prototype The prototype token.
+     * \param storage_id The id to be assigned to the allocated token.
+     */
+    StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+      auto shape = GetSize2D(prototype);
+      MemBlock block;
+      block.x_ = shape.width;
+      block.y_ = shape.height;
+      prototype->storage_id = storage_id;
+      block.token_ = prototype;
+      blocks_[prototype->storage_id] = block;
+      return prototype;
+    }
+    /*!
+     * \brief Check if we can release token.
+     * \param tok The token to be released.
+     */
+    void CheckForRelease(StorageToken* tok) {
+      ICHECK_GE(tok->storage_id, 0);
+      ICHECK_GE(tok->ref_counter, 0);
+      if (tok->ref_counter == 0) {
+        free_list_.insert(tok->storage_id);
+      }
+    }
+    /*!
+     * \brief Get the texture 2d size requirement
+     * \param prototype The prototype token.
+     * \return The required texture 2d memory size in (width, height, channel).
+     */
+    Texture2DShape<int64_t> GetSize2D(StorageToken* prototype) {
+      const TensorTypeNode* ttype = prototype->ttype;
+      ICHECK(ttype != nullptr);
+      size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(), prototype->storage_scope);
+      struct Shape {
+        const Array<PrimExpr>& shape;
+        int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); }
+      };
+      return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape}, ttype->shape.size(), axis);
+    }
+
+   private:
+    struct MemBlock {
+      StorageToken* token_;
+      int64_t x_;
+      int64_t y_;
+    };
+
+    std::unordered_map<int64_t, MemBlock> blocks_;
+    std::unordered_set<int64_t> free_list_;
+  };
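
The 2d Request above grows a cached texture just enough to cover the request (the new extents are per-axis maxima) and scores candidates by added area first, then by wasted area among blocks that need no growth. A standalone numeric sketch of that scoring, with invented extents:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    struct Block { int64_t x, y; };  // cached texture extents

    int main() {
      std::vector<Block> cached{{16, 16}, {64, 8}, {128, 128}};
      const int64_t w = 32, h = 8;  // requested extents
      const int64_t requested = w * h;
      int64_t min_added = std::numeric_limits<int64_t>::max();
      int64_t min_wasted = std::numeric_limits<int64_t>::max();
      int best = -1;
      for (int i = 0; i < static_cast<int>(cached.size()); ++i) {
        int64_t nx = std::max(cached[i].x, w);
        int64_t ny = std::max(cached[i].y, h);
        int64_t added = nx * ny - cached[i].x * cached[i].y;
        int64_t wasted = nx * ny - requested;
        // Expansion cost dominates; waste breaks ties once a
        // zero-growth candidate has been found.
        if ((min_added > 0 && added < min_added) ||
            (min_added == 0 && wasted < min_wasted)) {
          min_added = added;
          min_wasted = wasted;
          best = i;
        }
      }
      // {16,16} must grow by 256 texels; {64,8} fits as-is and wastes less
      // than {128,128}, so block 1 wins with added=0, wasted=256.
      std::printf("best=%d added=%lld wasted=%lld\n", best,
                  static_cast<long long>(min_added), static_cast<long long>(min_wasted));
      return 0;
    }
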
+
+  class TokenAllocator {
+   public:
+    StorageToken* Alloc(StorageToken* proto) {
+      return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++) : token_1d_.Alloc(proto, storage_ids_++);
+    }
+    StorageToken* Request(StorageToken* proto) {
+      StorageToken* token = Is2DStorage(proto) ? token_2d_.Request(proto) : token_1d_.Request(proto);
+      return token ? token : this->Alloc(proto);
+    }
+    void CheckForRelease(StorageToken* tok) {
+      return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) : token_1d_.CheckForRelease(tok);
+    }
+    static bool Is2DStorage(StorageToken* tok) { return relay::Is2DStorage(tok->storage_scope); }
+
+   private:
+    int64_t storage_ids_{0};
+    TokenAllocator1D token_1d_;
+    TokenAllocator2D token_2d_;
+  };

- private:
   // allocator
   support::Arena arena_;
-  // scale used for rough match
-  size_t match_range_{16};
-  // free list of storage entry
-  std::multimap<size_t, StorageToken*> free_;
-  // all the storage resources available
-  std::vector<StorageToken*> data_;
   /*! \brief internal prototype token map */
   std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
+  /*! \brief token allocator for optimizing 1d and 2d token alloc requests */
+  TokenAllocator allocator_;
 };

 Map<Expr, Array<IntegerArray>> GraphPlanMemory(const Function& func) {
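
The wrapper dispatches purely on the token's storage scope: relay::Is2DStorage during planning, and IsTextureStorage (aliased as details::Is2DStorage in the runtime changes below). A toy sketch of that dispatch, assuming the scope-string convention the texture runtime uses (a "texture" tag such as "global.texture"); the authoritative predicate lives in runtime/texture.h:

    #include <cstdio>
    #include <string>

    // Assumed convention: any scope carrying a "texture" tag is planned
    // against the 2d allocator; plain "global" stays in the 1d byte pool.
    bool Is2DStorage(const std::string& scope) {
      return scope.find("texture") != std::string::npos;
    }

    int main() {
      for (const char* scope : {"global", "global.texture", "global.texture-weight"}) {
        std::printf("%-22s -> %s allocator\n", scope, Is2DStorage(scope) ? "2d" : "1d");
      }
      return 0;
    }
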
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 6c51e711aef1..e50d615f147b 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -39,6 +39,7 @@
 #include

 #include "../file_utils.h"
+#include "../texture.h"

 namespace tvm {
 namespace runtime {
@@ -48,6 +49,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
   if (align < kAllocAlignment) return kAllocAlignment;
   return align;
 }
+constexpr auto Is2DStorage = IsTextureStorage;
 }  // namespace details

 /*!
@@ -279,24 +281,18 @@ void GraphRuntime::SetupStorage() {
   // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    ICHECK_GE(storage_id, 0) << "Do not support runtime shape op";
+    std::string storage_scope = attrs_.storage_scope[i];
     // Use the fallback device if no device index is available.
     int device_type = static_cast<int>(ctxs_[0].device_type);
     if (!attrs_.device_index.empty()) {
       device_type = attrs_.device_index[i];
     }
-    size_t size = 1;
-    for (int64_t sz : attrs_.shape[i]) {
-      size *= static_cast<size_t>(sz);
-    }
-    ICHECK_GE(storage_id, 0) << "Do not support runtime shape op";
-    DLDataType t = vtype[i];
-    size_t bits = t.bits * t.lanes;
-    ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
-    size_t bytes = ((bits + 7U) / 8U) * size;
     uint32_t sid = static_cast<uint32_t>(storage_id);
+
     if (sid >= pool_entry.size()) {
-      pool_entry.resize(sid + 1, {0, -1});
+      pool_entry.resize(sid + 1, {-1, {0}, {}});
     } else {
       ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type)
           << "The same pool entry cannot be assigned to multiple devices";
@@ -313,8 +309,38 @@ void GraphRuntime::SetupStorage() {
       pool_entry[sid].linked_param = lookup_rv;
     }
     pool_entry[sid].param_data_entry = i;
-    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
     pool_entry[sid].device_type = device_type;
+    pool_entry[sid].scope = storage_scope;
+
+    DLDataType t = vtype[i];
+    if (!details::Is2DStorage(storage_scope)) {
+      size_t size = 1;
+      for (int64_t sz : attrs_.shape[i]) {
+        size *= static_cast<size_t>(sz);
+      }
+      size_t bits = t.bits * t.lanes;
+      ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
+      int64_t bytes = ((bits + 7U) / 8U) * size;
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes);
+      pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1};
+    } else {
+      if (pool_entry[sid].shape.size() == 1) {
+        pool_entry[sid].shape.resize(3, 0);
+      }
+      size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope);
+      auto shape = ApplyTexture2DFlattening<int64_t>(attrs_.shape[i], attrs_.shape[i].size(), axis);
+      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height);
+      pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width);
+      CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel)
+          << pool_entry[sid].shape[2] << " != " << shape.channel
+          << ", texture channel length must be consistent within a storage pool";
+      pool_entry[sid].shape[2] = shape.channel;
+      CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t))
+          << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t)
+          << ", pool entry for 2d texture allocations must be of the same type;"
+          << " downstream error from memory planner likely";
+      pool_entry[sid].dtype = t;
+    }
   }

   // Allocate the space.
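
The else-branch above folds an N-d tensor into 2d texture extents before taking per-pool maxima. A standalone sketch of the assumed flattening convention (axes before the separator collapse into the height, the remaining outer axes into the width, and the innermost axis maps to the texel channels); the real logic is DefaultTextureLayoutSeparator plus ApplyTexture2DFlattening:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Tex2D { int64_t width, height, channel; };

    Tex2D Flatten(const std::vector<int64_t>& shape, size_t axis) {
      Tex2D t{1, 1, shape.back()};
      for (size_t i = 0; i < axis; ++i) t.height *= shape[i];
      for (size_t i = axis; i + 1 < shape.size(); ++i) t.width *= shape[i];
      return t;
    }

    int main() {
      // e.g. an NCHW4c activation [1, 32, 56, 56, 4] split at axis 2:
      Tex2D t = Flatten({1, 32, 56, 56, 4}, 2);
      // height = 1*32 = 32, width = 56*56 = 3136, channel = 4 (RGBA)
      std::printf("height=%lld width=%lld channel=%lld\n",
                  static_cast<long long>(t.height), static_cast<long long>(t.width),
                  static_cast<long long>(t.channel));
      return 0;
    }
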
@@ -328,9 +354,12 @@ void GraphRuntime::SetupStorage() {
     if (pit.linked_param.defined()) {
       storage_pool_.push_back(pit.linked_param);
     } else {
-      std::vector<int64_t> shape;
-      shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
-      storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
+      std::vector<int64_t> shape = pit.shape;
+      if (shape.size() == 1) {
+        shape[0] = (shape[0] + 3) / 4;
+      }
+      Optional<String> mem_scope = String(pit.scope);
+      storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, ctx, mem_scope));
     }
   }

diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index a1e2ee3b5d74..c7e5871dbc17 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -180,8 +180,9 @@ class TVM_DLL GraphRuntime : public ModuleNode {
  protected:
   // Memory pool entry.
   struct PoolEntry {
-    size_t size;
     int device_type;
+    std::vector<int64_t> shape;
+    DLDataType dtype;
     int param_data_entry;
     NDArray linked_param;
     // PoolEntry(int s, int dev_type, void* pre_linked_param) :
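
With the PoolEntry changes above, SetupStorage realizes each pool either as a flat pool of 4-byte float32 words (rank-1 shape, sized in bytes) or as a (height, width, channel) texture with its planned dtype and scope. A minimal sketch of that final step, assuming the four-argument NDArray::Empty overload with an optional memory scope that this patch relies on (the helper name is hypothetical):

    #include <cstdint>
    #include <string>
    #include <vector>

    #include <tvm/runtime/container.h>
    #include <tvm/runtime/ndarray.h>

    using tvm::runtime::NDArray;
    using tvm::runtime::String;

    NDArray AllocPoolEntry(std::vector<int64_t> shape, DLDataType dtype,
                           TVMContext ctx, const std::string& scope) {
      if (shape.size() == 1) {
        // 1d pools are tracked in bytes; repack into 4-byte float32 words.
        shape[0] = (shape[0] + 3) / 4;
        return NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx);
      }
      // 2d pools keep (height, width, channel); the memory scope routes the
      // allocation to a texture-capable device API (e.g. OpenCL images).
      return NDArray::Empty(shape, dtype, ctx, String(scope));
    }
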