From 245ed1d218eb3ed13c88f58c0604376b0572e8f1 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 27 Oct 2020 16:57:48 -0700 Subject: [PATCH 01/59] Add support for kTexture storage rank. --- src/runtime/thread_storage_scope.h | 7 +++++++ src/te/operation/op_utils.cc | 6 ++++-- src/te/schedule/bound.cc | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index ac8260ffbe39..611a40d996ea 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -59,6 +59,8 @@ enum class StorageRank { kWMMAMatrixB = 5, /*! \brief wmma scope memory of accumulator */ kWMMAAccumulator = 6, + /*! \brief global scope texture memory */ + kTexture = 7, }; /*! @@ -108,6 +110,8 @@ struct StorageScope { return "wmma.matrix_b" + tag; case StorageRank::kWMMAAccumulator: return "wmma.accumulator" + tag; + case StorageRank::kTexture: + return "texture" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; @@ -143,6 +147,9 @@ struct StorageScope { } else if (s.compare(0, 16, "wmma.accumulator") == 0) { r.rank = StorageRank::kWMMAAccumulator; r.tag = s.substr(16, std::string::npos); + } else if (s.compare(0, 7, "texture") == 0) { + r.rank = StorageRank::kTexture; + r.tag = s.substr(7, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; } diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index b3897e142545..de0d6b5be848 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -156,10 +156,12 @@ std::vector > MakeLoopNest(const Stage& stage, nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { value_map[iv] = dom->min; + } else if (stage->scope == "") { + value_map[iv] = var; } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); - if (stage->scope == "" || - static_cast(runtime::StorageScope::Create(stage->scope).rank) <= ts.rank) { + runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); + if (static_cast(ss.rank) <= ts.rank || ss.rank == runtime::StorageRank::kTexture) { value_map[iv] = var; } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 12c9b5538b44..c7ec8f23892c 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -66,7 +66,7 @@ bool NeedRelax(const IterVar& iv, bool found_attach, if (scope.rank == StorageRank::kWarp && ts.rank == 1 && ts.dim_index == 0) { return true; } - return static_cast(scope.rank) <= ts.rank; + return static_cast(scope.rank) <= ts.rank || scope.rank == StorageRank::kTexture; } // infer storage scope, if not given From 9c0921320e4a26294547eff4c7d9ed7b92120d2c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 2 Nov 2020 14:39:40 -0800 Subject: [PATCH 02/59] Add scaffolding for texture_flatten pass. 
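This change only wires up the pass boilerplate: a Python binding, the FFI
registration, and a TextureFlattener mutator whose BufferStore/BufferLoad
visitors are still stubs. For context, a minimal schedule that would exercise
the "texture" scope introduced in the previous patch might look like the
sketch below. The pipeline, shapes, and names are illustrative assumptions,
not part of this change; the innermost extent of 4 anticipates the RGBA
constraint added later in the series.

    # Illustrative sketch only, not part of this patch.
    import tvm
    from tvm import te

    n = 64
    A = te.placeholder((n, n, 4), name="A", dtype="float32")
    B = te.compute((n, n, 4), lambda i, j, c: A[i, j, c] * 2.0, name="B")
    C = te.compute((n, n, 4), lambda i, j, c: B[i, j, c] + 1.0, name="C")

    s = te.create_schedule(C.op)
    s[B].set_scope("texture")  # uses the kTexture storage rank from patch 01

Once registered, the pass composes like any other TIR pass, e.g.
tvm.tir.transform.TextureFlatten(cache_line_size, create_bound_attribute)(mod)
applied to a lowered IRModule.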
--- python/tvm/tir/transform/transform.py | 20 ++++ src/tir/transforms/texture_flatten.cc | 137 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 src/tir/transforms/texture_flatten.cc diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 537499a27fa9..a93cf2c3b3f8 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,6 +94,26 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore +def TextureFlatten(cache_line_size, create_bound_attribute=False): + """Flatten the multi-dimensional read/write to 1D. + + + Parameters + ---------- + cache_line_size: int + The size of CPU cache line. + + create_bound_attribute: + Whether to create bound attributes. + + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten(cache_line_size, create_bound_attribute) + def InjectCopyIntrin(pragma_key: str, fintrin): """Inject virtual thread loops. diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc new file mode 100644 index 000000000000..b0e7bd0379dc --- /dev/null +++ b/src/tir/transforms/texture_flatten.cc @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file storage_flatten.cc + * \brief Flattens storage from multi-dimensional array to 1D buffer access + */ +// The pass definition originates from Halide pipeline. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../arith/ir_visitor_with_analyzer.h" +#include "../../runtime/thread_storage_scope.h" +#include "arg_binder.h" +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +using runtime::StorageRank; +using runtime::StorageScope; +using runtime::ThreadScope; + +class TextureFlattener : public StmtExprMutator { + public: + explicit TextureFlattener(const Map& extern_buffer_map, int cache_line_size, + bool create_bound_attributes, IRVisitorWithAnalyzer* bound_analyzer) {} + + Stmt VisitStmt_(const AttrStmtNode* op) final { + if (op->attr_key == attr::realize_scope) { + storage_scope_[op->node.get()] = op->value.as()->value; + } + return StmtExprMutator::VisitStmt_(op); + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // stmt = Evaluate(Call(op->buffer->dtype, builtin::isnan(), {op->value})); + } + return stmt; + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // expr = Call(op->buffer->dtype, builtin::isnan(), {expr}); + } + return expr; + } + private: + // Storage scope + std::unordered_map storage_scope_; +}; + +PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { + auto fptr = func.CopyOnWrite(); + + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, + &bound_analyzer)(std::move(fptr->body)); + return func; +} + +namespace transform { + +Pass TextureFlatten(int cache_line_size, bool create_bound_attributes) { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureFlatten(std::move(f), cache_line_size, create_bound_attributes); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.TextureFlatten").set_body_typed(TextureFlatten); + +} // namespace transform + +} // namespace tir +} // namespace tvm From d76878cc246660d0ecb6c395d07b22c363336866 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 2 Nov 2020 15:50:21 -0800 Subject: [PATCH 03/59] Add scaffolding for texture allocation. 
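The BufferRealize handling added here is still commented out; it sketches how
a texture-scoped realize will eventually become an allocation plus a
storage_scope attribute. To see the realize nodes this visitor targets, the
pass from the previous patch can be run by hand over a lowered module, as in
the sketch below, which continues the patch 02 example and is illustrative
only: where TextureFlatten should sit relative to StorageFlatten in the
built-in lowering pipeline is not settled by this change.

    # Illustrative sketch only, continuing the earlier example.
    mod = tvm.lower(s, [A, C], name="main")
    mod = tvm.tir.transform.TextureFlatten(64, False)(mod)
    print(mod)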
--- src/tir/transforms/texture_flatten.cc | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index b0e7bd0379dc..eedb2ff3bf56 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,6 +62,40 @@ class TextureFlattener : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } + Stmt VisitStmt_(const BufferRealizeNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // Stmt body = this->VisitStmt(op->body); + // Array shape; + // for (auto r : op->bounds) { + // shape.push_back(r->extent); + // } + // if (shape.size() == 0) { + // shape.push_back(make_const(DataType::Int(32), 1)); + // } + // DataType storage_type = op->buffer->dtype; + // // TODO(csullivan): Consider check on float only + // stmt = Allocate(op->buffer->data, storage_type, shape, + // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); + // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + } + return stmt; + } + Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -111,12 +145,14 @@ class TextureFlattener : public StmtExprMutator { }; PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { + // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, &bound_analyzer)(std::move(fptr->body)); + // std::cout << "After TextureFlattening: " << func << std::endl; return func; } From e7c276b8fbd9afdeff1d8e83b93a19aec7dd7232 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 3 Nov 2020 14:16:54 -0800 Subject: [PATCH 04/59] Implement 2d texture flattening to builtin tir.text2d_alloca. --- include/tvm/tir/builtin.h | 4 ++++ src/tir/op/builtin.cc | 3 +++ src/tir/transforms/texture_flatten.cc | 27 +++++++++++++-------------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 61280d33f1df..9f58985df758 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -600,6 +600,10 @@ TVM_DLL const Op& vectorcombine(); * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA */ TVM_DLL const Op& atomic_add(); +/*! + * \brief Create a texture 2d memory allocation + */ +TVM_DLL const Op& text2d_alloca(); /*! 
\brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index f0ca04cbd5fd..4d6575eecaf7 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -246,6 +246,9 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index eedb2ff3bf56..a38a498612e7 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -78,20 +78,19 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // Stmt body = this->VisitStmt(op->body); - // Array shape; - // for (auto r : op->bounds) { - // shape.push_back(r->extent); - // } - // if (shape.size() == 0) { - // shape.push_back(make_const(DataType::Int(32), 1)); - // } - // DataType storage_type = op->buffer->dtype; - // // TODO(csullivan): Consider check on float only - // stmt = Allocate(op->buffer->data, storage_type, shape, - // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); - // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + Stmt body = this->VisitStmt(op->body); + Array shape; + for (auto r : op->bounds) { + shape.push_back(r->extent); + } + ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + + // TODO(csullivan): Consider check on float only + StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); + stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); } return stmt; } From bf321c9f4b061353ebb5e552bac68b2df2d7bec1 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 10:06:01 -0800 Subject: [PATCH 05/59] Lower BufferStore/Load to builtin texture store/load. --- include/tvm/tir/builtin.h | 10 ++++++++++ src/tir/op/builtin.cc | 6 ++++++ src/tir/transforms/texture_flatten.cc | 26 ++++++++++++++++++++------ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 9f58985df758..66fa069d62fa 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -605,6 +605,16 @@ TVM_DLL const Op& atomic_add(); */ TVM_DLL const Op& text2d_alloca(); +/*! + * \brief Store to a texture 2d memory + */ +TVM_DLL const Op& text2d_store(); + +/*! + * \brief Load from a texture 2d memory + */ +TVM_DLL const Op& text2d_load(); + /*! 
\brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 4d6575eecaf7..ae6397ba9e5c 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -249,6 +249,12 @@ TIR_DEFINE_BUILTIN_FUNC(atomic_add) TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(text2d_store) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(text2d_load) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index a38a498612e7..256ac3cda4dc 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -86,12 +86,14 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // TODO(csullivan): Consider check on float only + // TODO(csullivan): Consider check on float only? StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); Array args = {dtype, shape[0], shape[1]}; stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); - stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating + // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); } + return stmt; } @@ -111,9 +113,15 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // stmt = Evaluate(Call(op->buffer->dtype, builtin::isnan(), {op->value})); + // TODO(csullivan): Need autovectorization + Array args = {op->buffer->data, op->value}; + for (auto& i : op->indices) + { + args.push_back(i); + } + stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); } + return stmt; } @@ -133,9 +141,15 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // expr = Call(op->buffer->dtype, builtin::isnan(), {expr}); + // TODO(csullivan): Need autovectorization + Array args = {op->buffer->data}; + for (auto& i : op->indices) + { + args.push_back(i); + } + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); } + return expr; } private: From 8afb61159183762996fbfe9b1442fcc7ec6d2d0c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 10:45:47 -0800 Subject: [PATCH 06/59] Add vectorizable attribure to texture load and store. 
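TVectorizable is the attribute the vectorize pass consults when deciding
whether a call may keep its vector lanes rather than being scalarized, so
marking the texture load/store builtins keeps the lanes of the RGBA axis
together when the surrounding loop is vectorized. The sketch below shows the
manual form of what a later patch automates; it continues the earlier example
and is illustrative only.

    # Illustrative sketch only, continuing the earlier example.
    i, j, c = s[B].op.axis
    s[B].vectorize(c)  # vectorize the length-4 RGBA axis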
--- src/tir/op/builtin.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index ae6397ba9e5c..7705369eb5c8 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -250,9 +250,11 @@ TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(text2d_store) + .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(text2d_load) + .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); } // namespace builtin From 17ca755a76749354f65d561fa15dca49b1ce81ce Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 16:57:14 -0800 Subject: [PATCH 07/59] Support auto-vectorization on the innermost (RGBA) axis. --- python/tvm/tir/transform/transform.py | 10 +---- src/tir/transforms/texture_flatten.cc | 53 ++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index a93cf2c3b3f8..4cdf7d47856e 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,25 +94,19 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore -def TextureFlatten(cache_line_size, create_bound_attribute=False): +def TextureFlatten(): """Flatten the multi-dimensional read/write to 1D. Parameters ---------- - cache_line_size: int - The size of CPU cache line. - - create_bound_attribute: - Whether to create bound attributes. - Returns ------- fpass : tvm.transform.Pass The result pass """ - return _ffi_api.TextureFlatten(cache_line_size, create_bound_attribute) + return _ffi_api.TextureFlatten() def InjectCopyIntrin(pragma_key: str, fintrin): diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 256ac3cda4dc..9a92476021f3 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -52,8 +52,7 @@ using runtime::ThreadScope; class TextureFlattener : public StmtExprMutator { public: - explicit TextureFlattener(const Map& extern_buffer_map, int cache_line_size, - bool create_bound_attributes, IRVisitorWithAnalyzer* bound_analyzer) {} + explicit TextureFlattener() : needs_vectorization_(true) {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::realize_scope) { @@ -120,6 +119,10 @@ class TextureFlattener : public StmtExprMutator { args.push_back(i); } stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + if (needs_vectorization_) + { + loop_vars_.insert({op->indices.back().get(), true}); + } } return stmt; @@ -148,32 +151,62 @@ class TextureFlattener : public StmtExprMutator { args.push_back(i); } expr = Call(op->buffer->dtype, builtin::text2d_load(), args); + if (needs_vectorization_) + { + loop_vars_.insert({op->indices.back().get(), true}); + } } return expr; } + + // Auto-vectorize texture load and store loops + Stmt VisitStmt_(const ForNode* op) final { + Stmt stmt; + if (!needs_vectorization_) + { + stmt = StmtMutator::VisitStmt_(op); + } + else if (op->for_type == ForType::Serial) + { + stmt = StmtMutator::VisitStmt_(op); + auto it = loop_vars_.find(op->loop_var.get()); + if (it != loop_vars_.end() && it->second) + { + stmt = For(op->loop_var, op->min, op->extent, ForType::Vectorized, op->device_api, 
op->body); + stmt = StmtMutator::VisitStmt_(stmt.as()); + } + } + else + { + needs_vectorization_ = false; + stmt = StmtMutator::VisitStmt_(op); + needs_vectorization_ = true; + } + + return stmt; + } + private: // Storage scope std::unordered_map storage_scope_; + std::unordered_map loop_vars_; + bool needs_vectorization_; }; -PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { +PrimFunc TextureFlatten(PrimFunc func) { // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); - - IRVisitorWithAnalyzer bound_analyzer; - bound_analyzer(fptr->body); - fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, - &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener()(std::move(fptr->body)); // std::cout << "After TextureFlattening: " << func << std::endl; return func; } namespace transform { -Pass TextureFlatten(int cache_line_size, bool create_bound_attributes) { +Pass TextureFlatten() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { - return TextureFlatten(std::move(f), cache_line_size, create_bound_attributes); + return TextureFlatten(std::move(f)); }; return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {}); } From 560baa9cb8f73cef48d46e71d2fd79fd5cea1b0d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 9 Nov 2020 11:14:37 -0800 Subject: [PATCH 08/59] Add read/write_imagef opencl codegen for builtin texture load/store. --- src/target/source/codegen_opencl.cc | 28 +++++++ src/tir/transforms/texture_flatten.cc | 101 +++++++++++++++++++++++--- src/tir/transforms/vectorize_loop.cc | 12 +++ 3 files changed, 131 insertions(+), 10 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index edb614d9c122..cb7b0f733b1a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -243,6 +243,34 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; this->PrintExpr(load->index, os); os << ')'; + } else if (op->op.same_as(builtin::text2d_store())) { + os << "write_imagef("; + this->PrintExpr(op->args[0], os); + os << ", "; + os << "(int2)("; + this->PrintExpr(op->args[2], os); + os << ", "; + this->PrintExpr(op->args[1], os); + os << "), "; + this->PrintExpr(op->args[3], os); + os << ")"; + } else if (op->op.same_as(builtin::text2d_load())) { + /* + float4 read_imagef(read_only image2d_t image, + sampler_t sampler, + int2 coord) + */ + // std::cout << "LOAD\n"; + // std::cout << op->args << std::endl; + os << "read_imagef("; + this->PrintExpr(op->args[0], os); + os << ", "; + os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; + os << "(int2)("; + this->PrintExpr(op->args[2], os); + os << ", "; + this->PrintExpr(op->args[1], os); + os << "))"; } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); // Enable atomics extension if used. 
diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 9a92476021f3..b97c32b391c9 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,6 +62,10 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { + //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); + // Var buffer_var(op->buffer->data->name_hint, vdtype); + // let_binding_.insert({op->buffer->data, buffer_var}); + Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -86,8 +90,10 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? - StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + //StringImm dtype = StringImm(runtime::DLDataType2String(vdtype)); + StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->data.dtype())); Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); @@ -96,6 +102,46 @@ class TextureFlattener : public StmtExprMutator { return stmt; } + // Stmt VisitStmt_(const BufferRealizeNode* op) final { + // //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); + // // Var buffer_var(op->buffer->data->name_hint, vdtype); + // // let_binding_.insert({op->buffer->data, buffer_var}); + + // Stmt stmt = StmtExprMutator::VisitStmt_(op); + // op = stmt.as(); + + // std::string storage_scope; + // auto it = storage_scope_.find(op->buffer.get()); + // if (it != storage_scope_.end()) + // { + // storage_scope = it->second; + // } + // else + // { + // storage_scope = op->buffer->scope; + // } + // if (storage_scope == "texture") + // { + // Stmt body = this->VisitStmt(op->body); + // Array shape; + // for (auto r : op->bounds) { + // shape.push_back(r->extent); + // } + // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + + // // TODO(csullivan): Consider check on float only? 
+ // StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + // Array args = {dtype, shape[0], shape[1]}; + // stmt = Allocate(op->buffer->data, op->buffer->dtype, shape, + // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); + // // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating + // //stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + // } + + // return stmt; + // } + Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -112,13 +158,29 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Need autovectorization - Array args = {op->buffer->data, op->value}; - for (auto& i : op->indices) + Array args; + if (let_binding_.count(op->buffer->data)) + { + args.push_back(let_binding_[op->buffer->data]); + } + else { - args.push_back(i); + args.push_back(op->buffer->data); } - stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + // for (auto& i : op->indices) + // { + // args.push_back(i); + // } + + // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. + // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. + for (size_t i = 0u; i < op->indices.size()-1; i++) + { + args.push_back(op->indices[i]); + } + args.push_back(op->value); + + stmt = Evaluate(Call(DataType::Void(), builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); @@ -144,12 +206,29 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Need autovectorization - Array args = {op->buffer->data}; - for (auto& i : op->indices) + Array args; + if (let_binding_.count(op->buffer->data)) + { + args.push_back(let_binding_[op->buffer->data]); + } + else + { + args.push_back(op->buffer->data); + } + + + // for (auto& i : op->indices) + // { + // args.push_back(i); + // } + + // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. + // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. 
+ for (size_t i = 0u; i < op->indices.size()-1; i++) { - args.push_back(i); + args.push_back(op->indices[i]); } + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { @@ -190,6 +269,8 @@ class TextureFlattener : public StmtExprMutator { private: // Storage scope std::unordered_map storage_scope_; + // Let binding + std::unordered_map let_binding_; std::unordered_map loop_vars_; bool needs_vectorization_; }; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 64956bc8ee54..3f33667ea2da 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -266,6 +266,18 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::if_then_else())) { return MutateIfThenElseExpr_(op); } + else if (op->op.same_as(builtin::text2d_load())) + { + return Call(op->dtype.with_lanes(4), op->op, op->args); + } + else if (op->op.same_as(builtin::text2d_store())) + { + int lane = 0; + Array value{op->args.back()}; + Array mutated_value = MutateArray(value, &lane); + Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; + return Call(op->dtype.with_lanes(lane), op->op, new_args); + } auto* op_ptr = op->op.as(); bool vectorizable = op_ptr && op_vectorizable_.get(GetRef(op_ptr), false); From 14806f51618cb8bc3d0de37e9b1d84b10a788c85 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 12 Nov 2020 11:42:04 -0800 Subject: [PATCH 09/59] Add TextureType support. --- include/tvm/ir/type.h | 49 +++++++++++++++++++++++++++ include/tvm/ir/type_functor.h | 4 +++ src/ir/type.cc | 27 +++++++++++++++ src/ir/type_functor.cc | 12 +++++++ src/printer/text_printer.h | 1 + src/printer/tir_text_printer.cc | 6 ++++ src/printer/tvmscript_printer.cc | 7 ++++ src/target/source/codegen_opencl.cc | 15 ++++++++ src/target/source/codegen_opencl.h | 1 + src/tir/op/op.cc | 2 +- src/tir/transforms/texture_flatten.cc | 6 ++-- 11 files changed, 126 insertions(+), 4 deletions(-) diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index c772650809fa..8d073e88b0ab 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -189,6 +189,55 @@ class PointerType : public Type { TVM_DEFINE_OBJECT_REF_METHODS(PointerType, Type, PointerTypeNode); }; +/*! + * \brief Low-level texture type. + * + * TextureType represents type hints in the TIR to be + * passed to the final code generator. + * + * TextureType should not occur in the high-level analysis. + * + * \sa TextureType + */ +class TextureTypeNode : public TypeNode { + public: + /*! + * \brief The base type of the texture. + */ + Type element_type; + + void VisitAttrs(AttrVisitor* v) { v->Visit("element_type", &element_type); } + + bool SEqualReduce(const TextureTypeNode* other, SEqualReducer equal) const { + return equal(element_type, other->element_type); + } + + void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(element_type); } + + static constexpr const char* _type_key = "TextureType"; + TVM_DECLARE_FINAL_OBJECT_INFO(TextureTypeNode, TypeNode); +}; + +/* + * \brief Managed reference to TextureTypeNode. + * \sa TextureTypeNode + */ +class TextureType : public Type { + public: + /*! + * \brief Constructor + * \param element_type The base type of the texture. + */ + TVM_DLL explicit TextureType(Type element_type); + /*! + * \brief Constructor + * \param element_type The base type of the texture. 
+ */ + TVM_DLL explicit TextureType(runtime::DataType dtype); + + TVM_DEFINE_OBJECT_REF_METHODS(TextureType, Type, TextureTypeNode); +}; + /*! \brief Possible kinds of TypeVars. */ enum TypeKind : int { kType = 0, diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index 11bf7d4740d0..c71051e6f61c 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -89,6 +89,7 @@ class TypeFunctor { virtual R VisitType_(const TypeDataNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PrimTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PointerTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; + virtual R VisitType_(const TextureTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitTypeDefault_(const Object* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; // unreachable, written to stop compiler warning @@ -112,6 +113,7 @@ class TypeFunctor { TVM_TYPE_FUNCTOR_DISPATCH(TypeDataNode); TVM_TYPE_FUNCTOR_DISPATCH(PrimTypeNode); TVM_TYPE_FUNCTOR_DISPATCH(PointerTypeNode); + TVM_TYPE_FUNCTOR_DISPATCH(TextureTypeNode); return vtable; } }; @@ -135,6 +137,7 @@ class TVM_DLL TypeVisitor : public TypeFunctor { void VisitType_(const TypeDataNode* op) override; void VisitType_(const PrimTypeNode* op) override; void VisitType_(const PointerTypeNode* op) override; + void VisitType_(const TextureTypeNode* op) override; }; /*! @@ -155,6 +158,7 @@ class TVM_DLL TypeMutator : public TypeFunctor { Type VisitType_(const TypeDataNode* op) override; Type VisitType_(const PrimTypeNode* op) override; Type VisitType_(const PointerTypeNode* op) override; + Type VisitType_(const TextureTypeNode* op) override; private: Array MutateArray(Array arr); diff --git a/src/ir/type.cc b/src/ir/type.cc index fe8e00329bbc..5e0c8911c543 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -67,6 +67,33 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << '*'; }); +TextureType::TextureType(Type element_type) { + ObjectPtr n = make_object(); + n->element_type = std::move(element_type); + data_ = std::move(n); +} +TextureType::TextureType(runtime::DataType dtype) { + ObjectPtr n = make_object(); + n->element_type = PrimType(dtype); + data_ = std::move(n); +} + + +TVM_REGISTER_NODE_TYPE(TextureTypeNode); + +TVM_REGISTER_GLOBAL("ir.TextureType").set_body_typed([](Type element_type) { + return TextureType(element_type); +}); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { + auto* node = static_cast(ref.get()); + p->stream << "texture "; + p->Print(node->element_type); + p->stream << '*'; + }); + + TypeVar::TypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc index 51d5d3778c10..e084a82ed7be 100644 --- a/src/ir/type_functor.cc +++ b/src/ir/type_functor.cc @@ -89,6 +89,8 @@ void TypeVisitor::VisitType_(const PrimTypeNode* op) {} void TypeVisitor::VisitType_(const PointerTypeNode* op) { this->VisitType(op->element_type); } +void TypeVisitor::VisitType_(const TextureTypeNode* op) { this->VisitType(op->element_type); } + Type TypeMutator::VisitType(const Type& t) { return t.defined() ? 
TypeFunctor::VisitType(t) : t; } @@ -198,6 +200,16 @@ Type TypeMutator::VisitType_(const PointerTypeNode* op) { } } +Type TypeMutator::VisitType_(const TextureTypeNode* op) { + Type element_type = VisitType(op->element_type); + + if (element_type.same_as(op->element_type)) { + return GetRef(op); + } else { + return TextureType(element_type); + } +} + // Implements bind. class TypeBinder : public TypeMutator { public: diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 0332a2d539d2..55f68f3e36cb 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -333,6 +333,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; + Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintIRModule(const IRModule& module); diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index f232994480f8..b137ae34107d 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -613,6 +613,12 @@ Doc TIRTextPrinter::VisitType_(const PointerTypeNode* node) { return doc; } +Doc TIRTextPrinter::VisitType_(const TextureTypeNode* node) { + Doc doc; + doc << "Texture(" << Print(node->element_type) << ")"; + return doc; +} + Doc TIRTextPrinter::VisitType_(const TupleTypeNode* node) { std::vector fields; for (Type field : node->fields) { diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index cc7536b48cfd..39852c39b82a 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -145,6 +145,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; + Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintBody(const Stmt& body); @@ -732,6 +733,12 @@ Doc TVMScriptPrinter::VisitType_(const PointerTypeNode* node) { return doc; } +Doc TVMScriptPrinter::VisitType_(const TextureTypeNode* node) { + Doc doc; + doc << "ty.Texture[" << Print(node->element_type) << "]"; + return doc; +} + Doc TVMScriptPrinter::VisitType_(const TupleTypeNode* node) { if (node->fields.empty()) { return Doc::Text("None"); diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index cb7b0f733b1a..20cccb0b9198 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -162,6 +162,21 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type"; } +void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) + if (auto* ptr = type.as()) { + return PrintType(ptr->dtype, os); + } else if (auto* ptr = type.as()) { + PrintType(ptr->element_type, os); + os << '*'; + } else if (auto* ptr = type.as()){ + os << "image2d_t"; + } else if (IsVoidType(type)) { + os << "void"; + } else { + LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; + } +} + void CodeGenOpenCL::PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, std::ostream& os) { // NOLINT(*) if (!HandleTypeMatch(buffer, t.element_of())) { diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 32102fec22b9..f2b6a252f16c 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -45,6 +45,7 @@ 
class CodeGenOpenCL final : public CodeGenC { void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintStorageSync(const CallNode* op) final; // NOLINT(*) void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) + void PrintType(const Type& type, std::ostream& os) final; // NOLINT(*) std::string GetVecLoad(DataType t, const VarNode* buffer, PrimExpr base) final; void PrintVecStore(const VarNode* buffer, DataType t, PrimExpr base, const std::string& value) final; // NOLINT(*) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index d29132450227..d03cf22094a8 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -51,7 +51,7 @@ using namespace tir; runtime::DataType GetRuntimeDataType(const Type& type) { if (auto* n = type.as()) { return n->dtype; - } else if (type.as()) { + } else if (type.as() || type.as()) { return DataType::Handle(); } else if (IsVoidType(type)) { return DataType::Void(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index b97c32b391c9..0b396f2ca56a 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,9 +62,9 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); - // Var buffer_var(op->buffer->data->name_hint, vdtype); - // let_binding_.insert({op->buffer->data, buffer_var}); + //Var buffer_var(op->buffer->data->name_hint, DataType::Handle()); + Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); + let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); From 0a321d1d0d8951a25bdc678d8fd55c4622081812 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 13 Nov 2020 13:02:42 -0800 Subject: [PATCH 10/59] Add InferTextureAccess pass to deduce __read_only and __write_only access qualifiers for texture vars. Also refactor use of restrict keyword to be var dependent. 
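The qualifiers are inferred by a small pre-pass over the PrimFunc body: a
texture var seen only in text2d_load calls is printed as __read_only, one
seen only in text2d_store as __write_only, and one seen in both gets no
qualifier. The restrict keyword is now emitted per parameter so it can be
suppressed for image2d_t arguments, where it does not apply. A quick way to
check the result is to print the generated device source; the snippet below
continues the earlier sketch and assumes an OpenCL-enabled build with the
texture lowering wired into the pipeline, which is still in flux at this
point in the series.

    # Illustrative sketch only, continuing the earlier example.
    mod = tvm.build(s, [A, C], target="opencl")
    print(mod.imported_modules[0].get_source())
    # look for "__read_only image2d_t" / "__write_only image2d_t" parameters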
--- src/target/source/codegen_c.cc | 10 +++- src/target/source/codegen_c.h | 2 + src/target/source/codegen_opencl.cc | 72 ++++++++++++++++++++++++++- src/target/source/codegen_opencl.h | 7 ++- src/tir/transforms/texture_flatten.cc | 55 +++----------------- 5 files changed, 93 insertions(+), 53 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index f676f0f598d8..a311111532c8 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -106,8 +106,8 @@ void CodeGenC::AddFunction(const PrimFunc& f) { } } - if (no_alias && restrict_keyword_.length() != 0) { - stream << ' ' << restrict_keyword_; + if (no_alias) { + PrintRestrict(v, stream); } } else { PrintType(GetType(v), stream); @@ -1018,6 +1018,12 @@ void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, return; } +void CodeGenC::PrintRestrict(const Var& v, std::ostream& os) { + if (restrict_keyword_.length() != 0) { + os << ' ' << restrict_keyword_; + } +} + static bool CheckOutermostBracketMatch(const std::string& s) { if (!s.empty() && s.front() == '(' && s.back() == ')') { size_t len = s.size(); diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index 6ebade7191f2..299f7e0a9cef 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -200,6 +200,8 @@ class CodeGenC : public ExprFunctor, virtual std::string CastFromTo(std::string value, DataType from, DataType target); // Get load of single element with expression virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os); + // Print restrict keyword for a given Var if applicable + virtual void PrintRestrict(const Var& v, std::ostream& os); protected: // Print reference to struct location diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 20cccb0b9198..13e65e20bcc6 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -33,12 +33,61 @@ namespace tvm { namespace codegen { +class InferTextureAccess : public StmtExprVisitor { +public: + static constexpr const uint8_t read_access = 1; + static constexpr const uint8_t write_access = 2; + + explicit InferTextureAccess() {} + std::unordered_map Infer(const Stmt& n) { + this->operator()(n); + std::unordered_map storage_scope_qualifiers; + for (auto& texture : var_access_map_) { + if (texture.second == read_access) { + storage_scope_qualifiers.insert({texture.first, "__read_only "}); + } + else if (texture.second == write_access) { + storage_scope_qualifiers.insert({texture.first, "__write_only "}); + } + else if (texture.second == (read_access | write_access)) { + storage_scope_qualifiers.insert({texture.first, ""}); + } + } + return storage_scope_qualifiers; + } + void VisitExpr_(const CallNode* op) { + if (!op->args.size()) + { + return; + } + if (const VarNode* buffer = op->args[0].as()) + { + if (op->op.same_as(builtin::text2d_load())) { + var_access_map_[buffer] |= read_access; + } + else if (op->op.same_as(builtin::text2d_store())) { + var_access_map_[buffer] |= write_access; + } + } + } +private: + std::unordered_map var_access_map_; +}; + + CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); + this->SetTextureScope(InferTextureAccess().Infer(f->body)); for (Var arg : f->params) { - if (arg.dtype().is_handle()) { + if (arg->type_annotation.as()) + { + // Storage scope qualifiers for 
textures are inferred + // and set prior function codegen. + continue; + } + else if (arg.dtype().is_handle()) { alloc_storage_scope_[arg.get()] = "global"; } } @@ -168,7 +217,7 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* } else if (auto* ptr = type.as()) { PrintType(ptr->element_type, os); os << '*'; - } else if (auto* ptr = type.as()){ + } else if (type.as()){ os << "image2d_t"; } else if (IsVoidType(type)) { os << "void"; @@ -226,6 +275,18 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os } else if (scope == "shared") { os << "__local "; } + else + { + os << scope; + } +} + +void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) { + // Only apply restrict qualifer for non-texture types + if (v->type_annotation.as() == nullptr) + { + os << ' ' << restrict_keyword_; + } } std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) { @@ -323,6 +384,13 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N } } +void CodeGenOpenCL::SetTextureScope(const std::unordered_map& scope) { // NOLINT(*) + for (auto& texture : scope) + { + alloc_storage_scope_.insert(texture); + } +} + runtime::Module BuildOpenCL(IRModule mod, Target target) { using tvm::runtime::Registry; bool output_ssa = false; diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index f2b6a252f16c..3bd71ba9dec8 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -51,8 +51,11 @@ class CodeGenOpenCL final : public CodeGenC { const std::string& value) final; // NOLINT(*) // the address of load/store void PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, - std::ostream& os); // NOLINT(*) - std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + std::ostream& os); // NOLINT(*) + void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) + std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + void SetTextureScope(const std::unordered_map&); // NOLINT(*) + // overload visitor void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0b396f2ca56a..047251e48f00 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -90,58 +90,19 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? 
- //StringImm dtype = StringImm(runtime::DLDataType2String(vdtype)); - StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->data.dtype())); - Array args = {dtype, shape[0], shape[1]}; + StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + + // StringImm func("device_api.opencl.AllocImage2d"); + // Array args = {func, dtype, shape[0], shape[1]}; + // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); - // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating - // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } return stmt; } - // Stmt VisitStmt_(const BufferRealizeNode* op) final { - // //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); - // // Var buffer_var(op->buffer->data->name_hint, vdtype); - // // let_binding_.insert({op->buffer->data, buffer_var}); - - // Stmt stmt = StmtExprMutator::VisitStmt_(op); - // op = stmt.as(); - - // std::string storage_scope; - // auto it = storage_scope_.find(op->buffer.get()); - // if (it != storage_scope_.end()) - // { - // storage_scope = it->second; - // } - // else - // { - // storage_scope = op->buffer->scope; - // } - // if (storage_scope == "texture") - // { - // Stmt body = this->VisitStmt(op->body); - // Array shape; - // for (auto r : op->bounds) { - // shape.push_back(r->extent); - // } - // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - - // // TODO(csullivan): Consider check on float only? - // StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); - // Array args = {dtype, shape[0], shape[1]}; - // stmt = Allocate(op->buffer->data, op->buffer->dtype, shape, - // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); - // // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating - // //stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); - // } - - // return stmt; - // } - Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -180,7 +141,7 @@ class TextureFlattener : public StmtExprMutator { } args.push_back(op->value); - stmt = Evaluate(Call(DataType::Void(), builtin::text2d_store(), args)); + stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); From b96daafa8fafa96cf431b556d297b83c41abeecb Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:46:28 -0700 Subject: [PATCH 11/59] Implement texture allocation as external function in TIR lowering. 
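The LetStmt produced by the flattening pass is rewritten by lower_tvm_builtin
into a pair of new backend entry points, mirroring the existing
TVMBackendAllocWorkspace/TVMBackendFreeWorkspace pattern: a null check on the
returned handle, the original body, then the free call. The prototype below
only illustrates the argument layout implied by MakeTextureAlloc (device type
and id, the first two realize extents, and the dtype code/bits hints); the
actual C signature is an assumption, not something this patch defines.

    # Illustrative sketch only: the implied calling convention expressed as a
    # ctypes prototype. Names and the void* return type are assumptions.
    import ctypes

    TVMBackendAllocTextureProto = ctypes.CFUNCTYPE(
        ctypes.c_void_p,   # opaque texture handle
        ctypes.c_int,      # device_type
        ctypes.c_int,      # device_id
        ctypes.c_uint64,   # extent of axis 0
        ctypes.c_uint64,   # extent of axis 1
        ctypes.c_int,      # dtype code hint
        ctypes.c_int,      # dtype bits hint
    )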
--- src/target/llvm/codegen_cpu.cc | 2 ++ src/tir/op/runtime.cc | 10 +++++++ src/tir/transforms/lower_tvm_builtin.cc | 38 +++++++++++++++++++++++++ src/tir/transforms/texture_flatten.cc | 6 ++-- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index ab96d6e69d14..8b01f9d9186e 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -403,6 +403,8 @@ void CodeGenCPU::InitGlobalContext(bool dynamic_lookup) { // Mark as context functions gv_func_map_["TVMBackendAllocWorkspace"] = nullptr; gv_func_map_["TVMBackendFreeWorkspace"] = nullptr; + gv_func_map_["TVMBackendAllocTexture"] = nullptr; + gv_func_map_["TVMBackendFreeTexture"] = nullptr; } } } diff --git a/src/tir/op/runtime.cc b/src/tir/op/runtime.cc index adabae9e75f7..2a894d00ec0c 100644 --- a/src/tir/op/runtime.cc +++ b/src/tir/op/runtime.cc @@ -37,5 +37,15 @@ TVM_REGISTER_OP("tir.TVMBackendFreeWorkspace") .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TVM_REGISTER_OP("tir.TVMBackendAllocTexture") + .set_num_inputs(6) + .set_attr("TGlobalSymbol", "TVMBackendAllocTexture") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TVM_REGISTER_OP("tir.TVMBackendFreeTexture") + .set_num_inputs(3) + .set_attr("TGlobalSymbol", "TVMBackendFreeTexture") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 8b70817398e4..19d434006b83 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -98,6 +98,15 @@ class BuiltinLower : public StmtExprMutator { } } + Stmt VisitStmt_(const LetStmtNode* op) final { + if (const CallNode* call = op->value.as()) { + if (call->op.same_as(builtin::text2d_alloca())) { + return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); + } + } + return StmtExprMutator::VisitStmt_(op); + } + Stmt VisitStmt_(const AllocateNode* op) { // Lower allocate to device allocate when needed. 
Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -184,6 +193,7 @@ class BuiltinLower : public StmtExprMutator { return StmtExprMutator::VisitExpr_(op); } } + // call shape PrimExpr MakeShape(const CallNode* op) { // if args.size() == 0, it represents a scalar shape () @@ -341,6 +351,34 @@ class BuiltinLower : public StmtExprMutator { return Call(op->dtype, builtin::tvm_call_trace_packed_lowered(), packed_args); } + Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; + Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); + + Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), + throw_last_error), + let->body}); + DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + Stmt alloca = LetStmt( + let->var, + Call(let->var.dtype(), Op::Get("tir.TVMBackendAllocTexture"), + {cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), + cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}), + body); + + PrimExpr free_op = Call(DataType::Int(32), Op::Get("tir.TVMBackendFreeTexture"), + {cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); + Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); + body = SeqStmt({alloca, free_stmt}); + return body; + } + private: bool IsArrayHandle(const PrimExpr& arg) { // specially set array handle. diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 047251e48f00..d8063105483d 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -90,13 +90,13 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? - StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + // StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); // StringImm func("device_api.opencl.AllocImage2d"); // Array args = {func, dtype, shape[0], shape[1]}; // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - Array args = {dtype, shape[0], shape[1]}; + Array args = {shape[0], shape[1]}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -141,7 +141,7 @@ class TextureFlattener : public StmtExprMutator { } args.push_back(op->value); - stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); From 83e9af34922942175c8816c30ad85fa430ac0d68 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 18 Nov 2020 10:59:43 -0800 Subject: [PATCH 12/59] Remove commented lines. 
--- src/target/source/codegen_opencl.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 13e65e20bcc6..18afc96301b9 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -331,13 +331,6 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { - /* - float4 read_imagef(read_only image2d_t image, - sampler_t sampler, - int2 coord) - */ - // std::cout << "LOAD\n"; - // std::cout << op->args << std::endl; os << "read_imagef("; this->PrintExpr(op->args[0], os); os << ", "; From fd0d23aca93fb90a209346442b534ae3094d86b0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 19 Nov 2020 16:36:54 -0800 Subject: [PATCH 13/59] Add nd->2d texture flattening. --- src/tir/ir/buffer.cc | 2 + src/tir/transforms/texture_flatten.cc | 285 +++++++++++++++++++++++--- 2 files changed, 254 insertions(+), 33 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 335ff19dd775..90560e0dcac7 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -54,6 +54,7 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } +namespace { // Split the given expression w.r.t the add operator inline std::vector ExprSplitAddition(const PrimExpr& expr) { using namespace tir; @@ -290,6 +291,7 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } +} PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d8063105483d..c69c9d68d0b2 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include +#include #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" @@ -45,10 +47,212 @@ namespace tvm { namespace tir { +namespace { + using IndexMod = tir::FloorModNode; + using IndexDiv = tir::FloorDivNode; + +// Split the given expression w.r.t the add operator +inline std::vector ExprSplitAddition(const PrimExpr& expr) { + using namespace tir; + std::vector ret; + std::stack split_buffer; + split_buffer.push(&expr); + while (!split_buffer.empty()) { + const PrimExpr* top_ele = split_buffer.top(); + split_buffer.pop(); + auto expr_add_match = top_ele->as(); + if (expr_add_match) { + split_buffer.push(&expr_add_match->b); + split_buffer.push(&expr_add_match->a); + } else { + ret.emplace_back(top_ele); + } + } + return ret; +} -using runtime::StorageRank; -using runtime::StorageScope; -using runtime::ThreadScope; +// Searches for the following types of expr: +// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// mod_l_expr = c +// mod_r_expr = k1 * k2 * ... * ki +// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) +// Currently the we will not search the add/mult combinations exhaustively +// as it will take too much computation. 
+inline std::pair MergeMulModInner(const PrimExpr& mult_expr, + const PrimExpr& mod_l_expr, + const PrimExpr& mod_r_expr) { + using namespace tir; + const MulNode* mult_ptr = mult_expr.as(); + if (!mult_ptr) return std::make_pair(false, PrimExpr()); + PrimExpr mult_outer = mult_ptr->b; + const PrimExpr* inner = &(mult_ptr->a); + // 1. Calculate the outer multiplier + while (true) { + mult_ptr = inner->as(); + if (mult_ptr) { + inner = &(mult_ptr->a); + mult_outer = mult_ptr->b * mult_outer; + } else { + break; + } + } + // 2. Search for the pattern c / (...) * (...) + c % (...) + // We match the search element with Add, Mul and Div. + // If Add is found, we need to continue our search for the rhs + // If Mult is found, we will expand the inner multiplication factor + // If Div is found, we will go on testing whether lhs matches the lhs of mod expr + // and returns the optimization result. + const PrimExpr* search_ptr = inner; + PrimExpr mult_inner; // The inner multiplication factor + PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized + tir::ExprDeepEqual expr_equal; + + while (true) { + auto inner_div_ptr = search_ptr->as(); + auto inner_mult_ptr = search_ptr->as(); + auto inner_add_ptr = search_ptr->as(); + if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { + return std::make_pair(false, PrimExpr()); + } else if (inner_div_ptr) { + PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; + if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && + expr_equal(inner_div_ptr->a, mod_l_expr)) { + // Found! + PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; + return std::make_pair(true, ret); + } else { + return std::make_pair(false, PrimExpr()); + } + } else if (inner_mult_ptr) { + mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; + search_ptr = &(inner_mult_ptr->a); + } else if (inner_add_ptr) { + if (mult_inner.get()) { + return std::make_pair(false, PrimExpr()); + } + no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; + search_ptr = &(inner_add_ptr->b); + } else { + LOG(FATAL) << "Unexpected search result!"; + break; + } + } + return std::make_pair(false, PrimExpr()); +} + +// Insert the elements into the corresponding mult_exprs and mod_exprs. +// If the element is found to match Mul, it will be pushed to the mult_exprs. +// If the element it found to match Mod, it will be pused to the mod_exprs. +// Otherwise, the elements will be added to the no_opt_sum variable +inline void MergeMulModInsertElements(const std::vector& eles, + std::list* mult_exprs, + std::list >* mod_exprs, + PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { + using namespace tir; + *has_mult = false; + *has_mod = false; + for (const PrimExpr* ele : eles) { + auto mod_ptr = ele->as(); + auto mult_ptr = ele->as(); + if (mod_ptr) { + *has_mod = true; + mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); + } else if (mult_ptr) { + *has_mult = true; + mult_exprs->emplace_back(*ele); + } else { + *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; + } + } +} + +// Searches for this types of expr: +// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// + c % (k1 * k2 * ... * ki) +// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c +// The search will be performed repeatively until no pattern is found. 
+// Return: a pair with (false, Expr()) if cannot be optimized. +// a pair with (true, optimized_expr) if can be optimized +inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { + using namespace tir; + // 1. Prepare the lists. + // We store two lists, a list that contain all the elements that match Mul and + // a list that contain all the elements that match Mod. + // The elements in the Mod will be used to match against the elements in Mul. + // The result will then be split and pushed back to these two lists. + PrimExpr simplified_base = analyzer->Simplify(base); + std::vector eles = ExprSplitAddition(simplified_base); + std::list mult_exprs; + std::list > mod_exprs; + PrimExpr no_opt_sum; + bool has_mult; + bool has_mod; + MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); + bool find_opt = false; + std::list >::iterator search_mod_it = mod_exprs.begin(); + // 2. Exhaustive Search + while (search_mod_it != mod_exprs.end()) { + std::list::iterator mult_it = mult_exprs.begin(); + bool inner_find_opt = false; + while (mult_it != mult_exprs.end()) { + std::pair ret = + MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); + if (ret.first) { + inner_find_opt = true; + auto temp_mod_it = search_mod_it; + ++search_mod_it; + mod_exprs.erase(temp_mod_it); + mult_exprs.erase(mult_it); + std::vector ret_eles = ExprSplitAddition(ret.second); + MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, + &has_mod); + if (has_mult) { + search_mod_it = mod_exprs.begin(); + } else if (has_mod && search_mod_it == mod_exprs.end()) { + search_mod_it--; + } + break; + } else { + ++mult_it; + } + } + find_opt = find_opt || inner_find_opt; + if (!inner_find_opt) { + ++search_mod_it; + } + } + if (!find_opt) { + return simplified_base; + } + for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; + } + for (std::list >::iterator it = mod_exprs.begin(); + it != mod_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? 
no_opt_sum + indexmod(it->first, it->second) + : indexmod(it->first, it->second); + } + return no_opt_sum; +} + +inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { + PrimExpr base = make_const(DataType::Int(32), 0); //IntImm(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + arith::Analyzer ana; + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = MergeMulMod(&ana, offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; +} + +size_t GetAxisSeparator() { + return 1; +} +} class TextureFlattener : public StmtExprMutator { public: @@ -62,7 +266,6 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - //Var buffer_var(op->buffer->data->name_hint, DataType::Handle()); Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); let_binding_.insert({op->buffer->data, buffer_var}); @@ -82,21 +285,25 @@ class TextureFlattener : public StmtExprMutator { if (storage_scope == "texture") { Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + ICHECK_EQ(static_cast(op->bounds.back()->extent.as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + Array shape; - for (auto r : op->bounds) { - shape.push_back(r->extent); + auto width = IntImm(DataType::Int(32), 1); + auto height = IntImm(DataType::Int(32), 1); + //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) + for (size_t i = 0; i < op->bounds.size()-1; i++) { + if (i < GetAxisSeparator()) { + width *= op->bounds[i]->extent; + } else { + height *= op->bounds[i]->extent; + } } - ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // TODO(csullivan): Consider check on float only? - // StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // StringImm func("device_api.opencl.AllocImage2d"); - // Array args = {func, dtype, shape[0], shape[1]}; - // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - - Array args = {shape[0], shape[1]}; + Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -128,17 +335,24 @@ class TextureFlattener : public StmtExprMutator { { args.push_back(op->buffer->data); } - // for (auto& i : op->indices) - // { - // args.push_back(i); - // } - - // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. - // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. 
- for (size_t i = 0u; i < op->indices.size()-1; i++) + + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - args.push_back(op->indices[i]); + if (i < GetAxisSeparator()) { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } else { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } } + + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + + args.push_back(row_offset); + args.push_back(col_offset); args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); @@ -178,18 +392,23 @@ class TextureFlattener : public StmtExprMutator { } - // for (auto& i : op->indices) - // { - // args.push_back(i); - // } - - // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. - // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. - for (size_t i = 0u; i < op->indices.size()-1; i++) + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - args.push_back(op->indices[i]); + if (i < GetAxisSeparator()) { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } else { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + args.push_back(row_offset); + args.push_back(col_offset); + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { From 56451127617e89b8248bafee8832f0e1fd91e5ec Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 2 Dec 2020 17:00:13 -0800 Subject: [PATCH 14/59] Bug fixes in opencl codegen (row<>col, access quals.) 
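
In OpenCL the int2 coordinate passed to read_image*/write_image* is
ordered (x, y), where x indexes the image width (columns) and y the
image height (rows), so the column offset must be printed before the
row offset. The access qualifiers now come from the inferred
texture_read/texture_write storage scopes instead of being emitted
inline. A minimal hand-written kernel (illustrative only, not
compiler output; names are placeholders) showing the intended
qualifiers and coordinate order:

    __kernel void copy_row(__read_only image2d_t src,
                           __write_only image2d_t dst,
                           int row) {
      const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
      int col = (int)get_global_id(0);
      // coordinate is (column, row), i.e. (x, y)
      float4 v = read_imagef(src, smp, (int2)(col, row));
      write_imagef(dst, (int2)(col, row), v);
    }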
--- src/target/source/codegen_opencl.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 18afc96301b9..8a82e9e78b0d 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -44,10 +44,10 @@ class InferTextureAccess : public StmtExprVisitor { std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { if (texture.second == read_access) { - storage_scope_qualifiers.insert({texture.first, "__read_only "}); + storage_scope_qualifiers.insert({texture.first, "texture_read"}); } else if (texture.second == write_access) { - storage_scope_qualifiers.insert({texture.first, "__write_only "}); + storage_scope_qualifiers.insert({texture.first, "texture_write"}); } else if (texture.second == (read_access | write_access)) { storage_scope_qualifiers.insert({texture.first, ""}); @@ -274,10 +274,10 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os os << "__global "; } else if (scope == "shared") { os << "__local "; - } - else - { - os << scope; + } else if (scope == "texture_read") { + os << "__read_only "; + } else if (scope == "texture_write") { + os << "__write_only "; } } @@ -324,9 +324,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[0], os); os << ", "; os << "(int2)("; - this->PrintExpr(op->args[2], os); - os << ", "; this->PrintExpr(op->args[1], os); + os << ", "; + this->PrintExpr(op->args[2], os); os << "), "; this->PrintExpr(op->args[3], os); os << ")"; @@ -336,9 +336,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << ", "; os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; os << "(int2)("; - this->PrintExpr(op->args[2], os); - os << ", "; this->PrintExpr(op->args[1], os); + os << ", "; + this->PrintExpr(op->args[2], os); os << "))"; } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); From 34549c7e248efe409d1f7b0a6ed92549a8d37ba0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 9 Dec 2020 15:29:23 -0800 Subject: [PATCH 15/59] Improve texture codegen by explicitly allocating local vector for the texture load. Also support indexing individual elements of the RGBA vector. --- src/target/source/codegen_opencl.cc | 44 +++++++++++++++++++++------ src/target/source/codegen_opencl.h | 3 ++ src/tir/transforms/texture_flatten.cc | 2 +- src/tir/transforms/vectorize_loop.cc | 8 ++++- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8a82e9e78b0d..010f30890217 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -305,6 +305,12 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType return os.str(); } +void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { + stored_value_ = op->value; + CodeGenC::VisitStmt_(op); + stored_value_ = PrimExpr(nullptr); +} + void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { if (op->op.same_as(builtin::address_of())) { // Overload tvm_address_of to add storage scope (e.g. __global). 
@@ -331,15 +337,35 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { - os << "read_imagef("; - this->PrintExpr(op->args[0], os); - os << ", "; - os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; - os << "(int2)("; - this->PrintExpr(op->args[1], os); - os << ", "; - this->PrintExpr(op->args[2], os); - os << "))"; + std::stringstream ss; + ss << "read_imagef("; + this->PrintExpr(op->args[0], ss); + ss << ", "; + ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; + ss << "(int2)("; + this->PrintExpr(op->args[1], ss); + ss << ", "; + this->PrintExpr(op->args[2], ss); + ss << "))"; + + // Only use local SSA if texture is not already being stored + auto value = GetRef(stored_value_.as()); + if (value.same_as(GetRef(op))) + { + os << ss.str(); + } else { + std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); + if (op->args.back().as()) + { + os << rhs; + } else { + os << "(("; + this->PrintType(op->dtype.with_lanes(1), os); + os << "*)&" << rhs << ")["; + this->PrintExpr(op->args.back(), os); + os << "]"; + } + } } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); // Enable atomics extension if used. diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 3bd71ba9dec8..374ae4ae56b6 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -61,6 +61,8 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) + void VisitStmt_(const StoreNode* op) final; // NOLINT(*) + private: // whether enable fp16 and fp64 extension @@ -68,6 +70,7 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. 
bool enable_atomics_{false}; + PrimExpr stored_value_{nullptr}; }; } // namespace codegen diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c69c9d68d0b2..953e6664940d 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -408,7 +408,7 @@ class TextureFlattener : public StmtExprMutator { PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); args.push_back(row_offset); args.push_back(col_offset); - + args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 3f33667ea2da..9943d1e37938 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -268,11 +268,17 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::text2d_load())) { - return Call(op->dtype.with_lanes(4), op->op, op->args); + int lane = 0; + Array fcd = MutateArray({op->args.back()}, &lane); + auto new_args = op->args; + new_args.pop_back(); + new_args.push_back(fcd[0]); + return Call(op->dtype.with_lanes(4), op->op, new_args); } else if (op->op.same_as(builtin::text2d_store())) { int lane = 0; + // Vectorize the value to store Array value{op->args.back()}; Array mutated_value = MutateArray(value, &lane); Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; From 623f2eb45a2bb1ed9bac1aa3755a790ec4202b6f Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 8 Dec 2020 14:30:11 -0800 Subject: [PATCH 16/59] Remove automatic vectorization code as it is no longer needed. --- src/tir/transforms/texture_flatten.cc | 39 +-------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 953e6664940d..28bbf3e61cb5 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -256,7 +256,7 @@ size_t GetAxisSeparator() { class TextureFlattener : public StmtExprMutator { public: - explicit TextureFlattener() : needs_vectorization_(true) {} + explicit TextureFlattener() {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::realize_scope) { @@ -356,10 +356,6 @@ class TextureFlattener : public StmtExprMutator { args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); - if (needs_vectorization_) - { - loop_vars_.insert({op->indices.back().get(), true}); - } } return stmt; @@ -410,49 +406,16 @@ class TextureFlattener : public StmtExprMutator { args.push_back(col_offset); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); - if (needs_vectorization_) - { - loop_vars_.insert({op->indices.back().get(), true}); - } } return expr; } - // Auto-vectorize texture load and store loops - Stmt VisitStmt_(const ForNode* op) final { - Stmt stmt; - if (!needs_vectorization_) - { - stmt = StmtMutator::VisitStmt_(op); - } - else if (op->for_type == ForType::Serial) - { - stmt = StmtMutator::VisitStmt_(op); - auto it = loop_vars_.find(op->loop_var.get()); - if (it != loop_vars_.end() && it->second) - { - stmt = For(op->loop_var, op->min, op->extent, ForType::Vectorized, op->device_api, op->body); - stmt = StmtMutator::VisitStmt_(stmt.as()); - } - } - else - { - needs_vectorization_ = false; - stmt = StmtMutator::VisitStmt_(op); - needs_vectorization_ = true; - } - - return stmt; 
- } - private: // Storage scope std::unordered_map storage_scope_; // Let binding std::unordered_map let_binding_; - std::unordered_map loop_vars_; - bool needs_vectorization_; }; PrimFunc TextureFlatten(PrimFunc func) { From 5f9ebd1ae78daeb48a17c25b5d6fa48ad6b57a70 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 15 Dec 2020 15:10:22 -0800 Subject: [PATCH 17/59] Improve SSA local use when storing texture read to scalar buffer. --- src/target/source/codegen_opencl.cc | 28 ++++++++++++++++++++++------ src/target/source/codegen_opencl.h | 4 +++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 010f30890217..6367bd1b40d8 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -306,9 +306,26 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType } void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { - stored_value_ = op->value; + if (auto call = op->value.as()) { + if (call->op.same_as(builtin::text2d_load())) { + need_texture_ssa_ = false; + // If storing a texture load into a buffer, don't use an + // intermediate local unless the buffer allocation is a + // single element selected from the texture read. + auto it = allocation_size_.find(op->buffer_var.get()); + if (it != allocation_size_.end() && it->second == 1) + { + need_texture_ssa_ = true; + } + } + } + CodeGenC::VisitStmt_(op); + need_texture_ssa_ = true; +} + +void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { + allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); - stored_value_ = PrimExpr(nullptr); } void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { @@ -349,11 +366,8 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ss << "))"; // Only use local SSA if texture is not already being stored - auto value = GetRef(stored_value_.as()); - if (value.same_as(GetRef(op))) + if (need_texture_ssa_) { - os << ss.str(); - } else { std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); if (op->args.back().as()) { @@ -365,6 +379,8 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args.back(), os); os << "]"; } + } else { + os << ss.str(); } } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 374ae4ae56b6..399dc6c4c007 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -62,6 +62,7 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* op) final; // NOLINT(*) + void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) private: @@ -70,7 +71,8 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. 
bool enable_atomics_{false}; - PrimExpr stored_value_{nullptr}; + bool need_texture_ssa_{true}; + std::unordered_map allocation_size_; }; } // namespace codegen From d8fbcfdb56c806882866ddd3291c51da26ccfb40 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 10 Dec 2020 11:19:30 -0800 Subject: [PATCH 18/59] Define texture flattening convention such that the outer Nd-1 axes are stored as rows, and the last axis is stored as columns. --- src/tir/transforms/texture_flatten.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 28bbf3e61cb5..0ca908826dac 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -249,8 +249,16 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] + // + + return shape_rank - 2; } } @@ -293,7 +301,7 @@ class TextureFlattener : public StmtExprMutator { auto height = IntImm(DataType::Int(32), 1); //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->bounds.size())) { width *= op->bounds[i]->extent; } else { height *= op->bounds[i]->extent; @@ -339,7 +347,7 @@ class TextureFlattener : public StmtExprMutator { Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { row_dims.push_back(op->buffer->shape[i]); row_indices.push_back(op->indices[i]); } else { @@ -391,7 +399,7 @@ class TextureFlattener : public StmtExprMutator { Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { row_dims.push_back(op->buffer->shape[i]); row_indices.push_back(op->indices[i]); } else { From b81620fe5decc10cdeac266751a1c1bcb60de36b Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:49:22 -0700 Subject: [PATCH 19/59] Add tir lowering and opencl codegen support for float16 textures. 
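
read_imageh/write_imageh are selected when the underlying buffer dtype
is float16, matching textures allocated with a half-float channel
type; read_imagef/write_imagef remain the default for float32. A
minimal hand-written kernel (illustrative only, not compiler output)
showing the half-precision variants; it assumes the device supports
cl_khr_fp16:

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    __kernel void copy_fp16(__read_only image2d_t src,
                            __write_only image2d_t dst) {
      const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
      int x = (int)get_global_id(0);
      int y = (int)get_global_id(1);
      // half-precision image access uses the *h variants
      half4 v = read_imageh(src, smp, (int2)(x, y));
      write_imageh(dst, (int2)(x, y), v);
    }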
--- src/target/source/codegen_opencl.cc | 17 +++++++++++++++-- src/tir/transforms/texture_flatten.cc | 10 +++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 6367bd1b40d8..d0efbdeaec88 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -343,7 +343,15 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(load->index, os); os << ')'; } else if (op->op.same_as(builtin::text2d_store())) { - os << "write_imagef("; + auto* texture_type = op->args[0].as()->type_annotation.as(); + ICHECK(texture_type != nullptr) << "builtin::text2d_store() only supports storing to texture buffers"; + DataType buffer_type = texture_type->element_type.as()->dtype; + if (buffer_type.is_float16()) { + os << "write_imageh("; + } + else if (buffer_type.is_float()) { + os << "write_imagef("; + } this->PrintExpr(op->args[0], os); os << ", "; os << "(int2)("; @@ -355,7 +363,12 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { std::stringstream ss; - ss << "read_imagef("; + if (op->dtype.is_float16()) { + ss << "read_imageh("; + } + else if (op->dtype.is_float()) { + ss << "read_imagef("; + } this->PrintExpr(op->args[0], ss); ss << ", "; ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0ca908826dac..c144e8ac5742 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -274,7 +274,7 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); + Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -299,7 +299,8 @@ class TextureFlattener : public StmtExprMutator { Array shape; auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); - //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) + // TODO(csulivan): We do not currently handle the case where + // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { width *= op->bounds[i]->extent; @@ -308,9 +309,6 @@ class TextureFlattener : public StmtExprMutator { } } - // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -427,10 +425,8 @@ class TextureFlattener : public StmtExprMutator { }; PrimFunc TextureFlatten(PrimFunc func) { - // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); fptr->body = TextureFlattener()(std::move(fptr->body)); - // std::cout << "After TextureFlattening: " << func << std::endl; return func; } From 557e07dfd863b6257180be437a19cd67d6386765 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 4 Feb 2021 15:31:15 -0800 Subject: [PATCH 20/59] Disable SSA when texture load is immediately 
casted. --- src/target/source/codegen_opencl.cc | 12 +++++++++++- src/target/source/codegen_opencl.h | 5 +++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index d0efbdeaec88..b8ff1d451445 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -315,7 +315,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { auto it = allocation_size_.find(op->buffer_var.get()); if (it != allocation_size_.end() && it->second == 1) { - need_texture_ssa_ = true; + need_texture_ssa_ = true; } } } @@ -323,6 +323,16 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { need_texture_ssa_ = true; } +void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { + if (auto call = op->value.as()) { + if (call->op.same_as(builtin::text2d_load())) { + need_texture_ssa_ = false; + } + } + CodeGenC::VisitExpr_(op, os); + need_texture_ssa_ = true; +} + void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 399dc6c4c007..a456fdd94f5f 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -58,11 +58,12 @@ class CodeGenOpenCL final : public CodeGenC { // overload visitor - void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) + void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const CastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* op) final; // NOLINT(*) - void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) private: From 19469c61f47372b9f933a932ef9364528dc04982 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 1 Mar 2021 15:08:51 -0800 Subject: [PATCH 21/59] Allow RGBA extent to be of length 1. --- src/tir/transforms/texture_flatten.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c144e8ac5742..5de637fd149e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -294,7 +294,8 @@ class TextureFlattener : public StmtExprMutator { { Stmt body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; - ICHECK_EQ(static_cast(op->bounds.back()->extent.as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + int vec_length = static_cast(op->bounds.back()->extent.as()->value); + ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; Array shape; auto width = IntImm(DataType::Int(32), 1); From 8b3b3ded0ec327d9fa14aff23c8020b77559efa7 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:54:15 -0700 Subject: [PATCH 22/59] Add pass to forward externally allocated textures in place of textures realized from cache_read. Fix to better follow indexing spec. 
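
The forwarding applies when a buffer realized in "texture" scope is
written only as an element-for-element copy of an externally provided
texture argument: if both buffers are in texture scope and the
flattened store and load offsets can be proven equal, downstream loads
are rebound to the external buffer so the intermediate copy introduced
by cache_read can be elided instead of generating an extra
image-to-image copy on device.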
--- src/tir/transforms/texture_flatten.cc | 211 +++++++++++++++++--------- 1 file changed, 141 insertions(+), 70 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 5de637fd149e..f73208e5759e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -18,10 +18,9 @@ */ /*! - * \file storage_flatten.cc - * \brief Flattens storage from multi-dimensional array to 1D buffer access + * \file texture_flatten.cc + * \brief Flattens texture from multi-dimensional array to 2D buffer access */ -// The pass definition originates from Halide pipeline. #include #include @@ -262,37 +261,70 @@ size_t GetAxisSeparator(size_t shape_rank) { } } -class TextureFlattener : public StmtExprMutator { +class TextureLoweringBase : public StmtExprMutator { public: - explicit TextureFlattener() {} + explicit TextureLoweringBase(const Map& extern_buffer_map) { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } - Stmt VisitStmt_(const AttrStmtNode* op) final { + virtual Stmt VisitStmt_(const AttrStmtNode* op) { if (op->attr_key == attr::realize_scope) { - storage_scope_[op->node.get()] = op->value.as()->value; + std::string realize_scope = op->value.as()->value; + // If realize_scope for external buffer is unset, infer from buffer scope + if (realize_scope == "" && op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + if (extern_buf_.count(realize->buffer)) { + realize_scope = realize->buffer->scope; + } + } + storage_scope_[op->node.get()] = realize_scope; } return StmtExprMutator::VisitStmt_(op); } + protected: + + std::string GetStorageScope(const Buffer& buffer) { + std::string storage_scope; + auto it = storage_scope_.find(buffer.get()); + // If buffer has a realize_scope attr return it + if (it != storage_scope_.end()) { + storage_scope = it->second; + } else { + storage_scope = buffer->scope; + } + return storage_scope; + } + + // Buffer set + std::unordered_set extern_buf_; + // Storage scope + std::unordered_map storage_scope_; +}; + +class TextureFlattener : public TextureLoweringBase { + public: + explicit TextureFlattener(const Map& extern_buffer_map, + const std::unordered_map& extern_buffer_binds_) + : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); + Stmt body = this->VisitStmt(op->body); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; - } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { - Stmt body = this->VisitStmt(op->body); + std::string storage_scope = GetStorageScope(op->buffer); + if (storage_scope == "texture") { + body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; @@ -304,9 +336,9 @@ class TextureFlattener : public StmtExprMutator { // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < 
op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { - width *= op->bounds[i]->extent; - } else { height *= op->bounds[i]->extent; + } else { + width *= op->bounds[i]->extent; } } @@ -321,37 +353,23 @@ class TextureFlattener : public StmtExprMutator { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; - } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { + std::string storage_scope = GetStorageScope(op->buffer); + if (storage_scope == "texture") { Array args; - if (let_binding_.count(op->buffer->data)) - { + if (let_binding_.count(op->buffer->data)) { args.push_back(let_binding_[op->buffer->data]); - } - else - { + } else { args.push_back(op->buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) - { + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { if (i < GetAxisSeparator(op->buffer->shape.size())) { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } else { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); } } @@ -372,38 +390,29 @@ class TextureFlattener : public StmtExprMutator { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; + auto buffer = op->buffer; + if (buffer_binds_.count(op->buffer)) { + buffer = buffer_binds_[op->buffer]; } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { + + std::string storage_scope = GetStorageScope(buffer); + if (storage_scope == "texture") { Array args; - if (let_binding_.count(op->buffer->data)) - { + if (let_binding_.count(op->buffer->data)) { args.push_back(let_binding_[op->buffer->data]); - } - else - { - args.push_back(op->buffer->data); + } else { + args.push_back(buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) - { + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { if (i < GetAxisSeparator(op->buffer->shape.size())) { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } else { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); } } @@ -418,16 +427,78 @@ class TextureFlattener : public StmtExprMutator { return expr; } - private: - // Storage scope - std::unordered_map storage_scope_; + protected: + // Let binding std::unordered_map let_binding_; + std::unordered_map buffer_binds_; }; + +class ExternalBufferForwarding : public TextureLoweringBase { + public: + explicit ExternalBufferForwarding(const Map& extern_buffer_map) + : TextureLoweringBase(extern_buffer_map) {;} + + Stmt VisitStmt_(const AttrStmtNode* op) final { + Stmt stmt = TextureLoweringBase::VisitStmt_(op); + if (op->attr_key == attr::realize_scope) { + if (op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + std::string realize_scope = GetStorageScope(realize->buffer); + if (realize_scope == "texture" && extern_buffer_copy_.count(realize->buffer)) { + return 
realize_attrs_.back(); + } else { + if (realize_attrs_.size()) { + realize_attrs_.pop_back(); + } + realize_attrs_.push_back(stmt); + } + return stmt; + } + } + + return stmt; + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + if (auto load = op->value.as()) { + if (extern_buf_.count(load->buffer)) { + // If the buffer to load and the buffer to store to are both texture + // check for identical access + if (GetStorageScope(load->buffer) == "texture" && GetStorageScope(op->buffer) == "texture") { + auto store_index = SimplifyOffset(op->buffer->shape, op->indices); + auto load_index = SimplifyOffset(load->buffer->shape, load->indices); + if (arith::Analyzer().CanProve(store_index == load_index)) { + extern_buffer_copy_.insert(op->buffer); + buffer_map_.insert({op->buffer, load->buffer}); + } + } + } + } + + return stmt; + } + + const std::unordered_map& GetForwardedBuffers() { + return buffer_map_; + } + + private: + std::deque realize_attrs_; + std::unordered_set extern_buffer_copy_; + std::unordered_map buffer_map_; +}; + + PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - fptr->body = TextureFlattener()(std::move(fptr->body)); + ExternalBufferForwarding forward(fptr->buffer_map); + fptr->body = forward(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers())(std::move(fptr->body)); return func; } From d7d3195cf3c1290ee0a12119efe9082106ba1c5f Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 15:44:11 -0700 Subject: [PATCH 23/59] Add buffer_common.h to house buffer offset simplification routines. --- src/tir/ir/buffer.cc | 245 +------------------------- src/tir/transforms/texture_flatten.cc | 190 +------------------- 2 files changed, 3 insertions(+), 432 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 90560e0dcac7..beee377d8401 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -29,17 +29,13 @@ #include #include -#include -#include +#include "buffer_common.h" #include "../../arith/pattern_match.h" namespace tvm { namespace tir { -using IndexMod = tir::FloorModNode; -using IndexDiv = tir::FloorDivNode; - Array SimplifyArray(arith::Analyzer* ana, Array array) { for (size_t i = 0; i < array.size(); ++i) { array.Set(i, ana->Simplify(array[i])); @@ -54,245 +50,6 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } -namespace { -// Split the given expression w.r.t the add operator -inline std::vector ExprSplitAddition(const PrimExpr& expr) { - using namespace tir; - std::vector ret; - std::stack split_buffer; - split_buffer.push(&expr); - while (!split_buffer.empty()) { - const PrimExpr* top_ele = split_buffer.top(); - split_buffer.pop(); - auto expr_add_match = top_ele->as(); - if (expr_add_match) { - split_buffer.push(&expr_add_match->b); - split_buffer.push(&expr_add_match->a); - } else { - ret.emplace_back(top_ele); - } - } - return ret; -} - -// Searches for the following types of expr: -// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// mod_l_expr = c -// mod_r_expr = k1 * k2 * ... * ki -// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) -// Currently the we will not search the add/mult combinations exhaustively -// as it will take too much computation. 
-inline std::pair MergeMulModInner(const PrimExpr& mult_expr, - const PrimExpr& mod_l_expr, - const PrimExpr& mod_r_expr) { - using namespace tir; - const MulNode* mult_ptr = mult_expr.as(); - if (!mult_ptr) return std::make_pair(false, PrimExpr()); - PrimExpr mult_outer = mult_ptr->b; - const PrimExpr* inner = &(mult_ptr->a); - // 1. Calculate the outer multiplier - while (true) { - mult_ptr = inner->as(); - if (mult_ptr) { - inner = &(mult_ptr->a); - mult_outer = mult_ptr->b * mult_outer; - } else { - break; - } - } - // 2. Search for the pattern c / (...) * (...) + c % (...) - // We match the search element with Add, Mul and Div. - // If Add is found, we need to continue our search for the rhs - // If Mult is found, we will expand the inner multiplication factor - // If Div is found, we will go on testing whether lhs matches the lhs of mod expr - // and returns the optimization result. - const PrimExpr* search_ptr = inner; - PrimExpr mult_inner; // The inner multiplication factor - PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized - tir::ExprDeepEqual expr_equal; - - while (true) { - auto inner_div_ptr = search_ptr->as(); - auto inner_mult_ptr = search_ptr->as(); - auto inner_add_ptr = search_ptr->as(); - if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { - return std::make_pair(false, PrimExpr()); - } else if (inner_div_ptr) { - PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; - if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && - expr_equal(inner_div_ptr->a, mod_l_expr)) { - // Found! - PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; - return std::make_pair(true, ret); - } else { - return std::make_pair(false, PrimExpr()); - } - } else if (inner_mult_ptr) { - mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; - search_ptr = &(inner_mult_ptr->a); - } else if (inner_add_ptr) { - if (mult_inner.get()) { - return std::make_pair(false, PrimExpr()); - } - no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; - search_ptr = &(inner_add_ptr->b); - } else { - LOG(FATAL) << "Unexpected search result!"; - break; - } - } - return std::make_pair(false, PrimExpr()); -} - -// Insert the elements into the corresponding mult_exprs and mod_exprs. -// If the element is found to match Mul, it will be pushed to the mult_exprs. -// If the element it found to match Mod, it will be pused to the mod_exprs. -// Otherwise, the elements will be added to the no_opt_sum variable -inline void MergeMulModInsertElements(const std::vector& eles, - std::list* mult_exprs, - std::list >* mod_exprs, - PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { - using namespace tir; - *has_mult = false; - *has_mod = false; - for (const PrimExpr* ele : eles) { - auto mod_ptr = ele->as(); - auto mult_ptr = ele->as(); - if (mod_ptr) { - *has_mod = true; - mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); - } else if (mult_ptr) { - *has_mult = true; - mult_exprs->emplace_back(*ele); - } else { - *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; - } - } -} - -// Searches for this types of expr: -// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// + c % (k1 * k2 * ... * ki) -// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c -// The search will be performed repeatively until no pattern is found. 
-// Return: a pair with (false, Expr()) if cannot be optimized. -// a pair with (true, optimized_expr) if can be optimized -inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { - using namespace tir; - // 1. Prepare the lists. - // We store two lists, a list that contain all the elements that match Mul and - // a list that contain all the elements that match Mod. - // The elements in the Mod will be used to match against the elements in Mul. - // The result will then be split and pushed back to these two lists. - PrimExpr simplified_base = base; - arith::PVar x, y; - if ((floordiv(x, y) * y + floormod(x, y)).Match(simplified_base)) { - simplified_base = x.Eval(); - } - simplified_base = analyzer->Simplify(simplified_base); - std::vector eles = ExprSplitAddition(simplified_base); - std::list mult_exprs; - std::list > mod_exprs; - PrimExpr no_opt_sum; - bool has_mult; - bool has_mod; - MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); - bool find_opt = false; - std::list >::iterator search_mod_it = mod_exprs.begin(); - // 2. Exhaustive Search - while (search_mod_it != mod_exprs.end()) { - std::list::iterator mult_it = mult_exprs.begin(); - bool inner_find_opt = false; - while (mult_it != mult_exprs.end()) { - std::pair ret = - MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); - if (ret.first) { - inner_find_opt = true; - auto temp_mod_it = search_mod_it; - ++search_mod_it; - mod_exprs.erase(temp_mod_it); - mult_exprs.erase(mult_it); - std::vector ret_eles = ExprSplitAddition(ret.second); - MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, - &has_mod); - if (has_mult) { - search_mod_it = mod_exprs.begin(); - } else if (has_mod && search_mod_it == mod_exprs.end()) { - search_mod_it--; - } - break; - } else { - ++mult_it; - } - } - find_opt = find_opt || inner_find_opt; - if (!inner_find_opt) { - ++search_mod_it; - } - } - if (!find_opt) { - return simplified_base; - } - for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; - } - for (std::list >::iterator it = mod_exprs.begin(); - it != mod_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) - : indexmod(it->first, it->second); - } - return no_opt_sum; -} - -// The buffer offset in convention of number of elements of -// original data ignoring number of lanes. -// We also perform optimization to simplify the indexing expression. 
-inline PrimExpr ElemOffset(const BufferNode* n, Array index) { - PrimExpr base = n->elem_offset; - arith::Analyzer ana; - if (n->strides.size() == 0) { - // Scalar case - if (n->shape.size() == 0 && index.size() == 1) { - auto is_int = index[0].as(); - ICHECK(is_int && is_int->value == 0); - base = base + index[0]; - } else { - ICHECK_EQ(n->shape.size(), index.size()); - if (index.size() > 0) { - PrimExpr offset = index[0]; - for (size_t i = 1; i < index.size(); ++i) { - offset = MergeMulMod(&ana, offset * n->shape[i] + index[i]); - } - base = base + offset; - } - } - } else { - ICHECK_EQ(n->strides.size(), index.size()); - if (is_zero(base)) { - base = MergeMulMod(&ana, index[0] * n->strides[0]); - } else { - base = MergeMulMod(&ana, base + index[0] * n->strides[0]); - } - for (size_t i = 1; i < index.size(); ++i) { - base = MergeMulMod(&ana, base + index[i] * n->strides[i]); - } - } - return base; -} - -inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataType dtype) { - PrimExpr offset = ElemOffset(n, index); - if (n->dtype.lanes() != 1) { - offset = offset * make_const(offset.dtype(), dtype.lanes()); - } - if (dtype.lanes() != 1) { - return tir::Ramp(offset, make_const(offset.dtype(), 1), dtype.lanes()); - } else { - return offset; - } -} -} - PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index f73208e5759e..d3544b5c02cf 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -41,201 +41,16 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" +#include "../ir/buffer_common.h" #include "arg_binder.h" #include "ir_utils.h" namespace tvm { namespace tir { namespace { - using IndexMod = tir::FloorModNode; - using IndexDiv = tir::FloorDivNode; - -// Split the given expression w.r.t the add operator -inline std::vector ExprSplitAddition(const PrimExpr& expr) { - using namespace tir; - std::vector ret; - std::stack split_buffer; - split_buffer.push(&expr); - while (!split_buffer.empty()) { - const PrimExpr* top_ele = split_buffer.top(); - split_buffer.pop(); - auto expr_add_match = top_ele->as(); - if (expr_add_match) { - split_buffer.push(&expr_add_match->b); - split_buffer.push(&expr_add_match->a); - } else { - ret.emplace_back(top_ele); - } - } - return ret; -} - -// Searches for the following types of expr: -// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// mod_l_expr = c -// mod_r_expr = k1 * k2 * ... * ki -// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) -// Currently the we will not search the add/mult combinations exhaustively -// as it will take too much computation. -inline std::pair MergeMulModInner(const PrimExpr& mult_expr, - const PrimExpr& mod_l_expr, - const PrimExpr& mod_r_expr) { - using namespace tir; - const MulNode* mult_ptr = mult_expr.as(); - if (!mult_ptr) return std::make_pair(false, PrimExpr()); - PrimExpr mult_outer = mult_ptr->b; - const PrimExpr* inner = &(mult_ptr->a); - // 1. Calculate the outer multiplier - while (true) { - mult_ptr = inner->as(); - if (mult_ptr) { - inner = &(mult_ptr->a); - mult_outer = mult_ptr->b * mult_outer; - } else { - break; - } - } - // 2. Search for the pattern c / (...) * (...) + c % (...) - // We match the search element with Add, Mul and Div. 
- // If Add is found, we need to continue our search for the rhs - // If Mult is found, we will expand the inner multiplication factor - // If Div is found, we will go on testing whether lhs matches the lhs of mod expr - // and returns the optimization result. - const PrimExpr* search_ptr = inner; - PrimExpr mult_inner; // The inner multiplication factor - PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized - tir::ExprDeepEqual expr_equal; - - while (true) { - auto inner_div_ptr = search_ptr->as(); - auto inner_mult_ptr = search_ptr->as(); - auto inner_add_ptr = search_ptr->as(); - if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { - return std::make_pair(false, PrimExpr()); - } else if (inner_div_ptr) { - PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; - if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && - expr_equal(inner_div_ptr->a, mod_l_expr)) { - // Found! - PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; - return std::make_pair(true, ret); - } else { - return std::make_pair(false, PrimExpr()); - } - } else if (inner_mult_ptr) { - mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; - search_ptr = &(inner_mult_ptr->a); - } else if (inner_add_ptr) { - if (mult_inner.get()) { - return std::make_pair(false, PrimExpr()); - } - no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; - search_ptr = &(inner_add_ptr->b); - } else { - LOG(FATAL) << "Unexpected search result!"; - break; - } - } - return std::make_pair(false, PrimExpr()); -} - -// Insert the elements into the corresponding mult_exprs and mod_exprs. -// If the element is found to match Mul, it will be pushed to the mult_exprs. -// If the element it found to match Mod, it will be pused to the mod_exprs. -// Otherwise, the elements will be added to the no_opt_sum variable -inline void MergeMulModInsertElements(const std::vector& eles, - std::list* mult_exprs, - std::list >* mod_exprs, - PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { - using namespace tir; - *has_mult = false; - *has_mod = false; - for (const PrimExpr* ele : eles) { - auto mod_ptr = ele->as(); - auto mult_ptr = ele->as(); - if (mod_ptr) { - *has_mod = true; - mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); - } else if (mult_ptr) { - *has_mult = true; - mult_exprs->emplace_back(*ele); - } else { - *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; - } - } -} - -// Searches for this types of expr: -// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// + c % (k1 * k2 * ... * ki) -// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c -// The search will be performed repeatively until no pattern is found. -// Return: a pair with (false, Expr()) if cannot be optimized. -// a pair with (true, optimized_expr) if can be optimized -inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { - using namespace tir; - // 1. Prepare the lists. - // We store two lists, a list that contain all the elements that match Mul and - // a list that contain all the elements that match Mod. - // The elements in the Mod will be used to match against the elements in Mul. - // The result will then be split and pushed back to these two lists. 
- PrimExpr simplified_base = analyzer->Simplify(base); - std::vector eles = ExprSplitAddition(simplified_base); - std::list mult_exprs; - std::list > mod_exprs; - PrimExpr no_opt_sum; - bool has_mult; - bool has_mod; - MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); - bool find_opt = false; - std::list >::iterator search_mod_it = mod_exprs.begin(); - // 2. Exhaustive Search - while (search_mod_it != mod_exprs.end()) { - std::list::iterator mult_it = mult_exprs.begin(); - bool inner_find_opt = false; - while (mult_it != mult_exprs.end()) { - std::pair ret = - MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); - if (ret.first) { - inner_find_opt = true; - auto temp_mod_it = search_mod_it; - ++search_mod_it; - mod_exprs.erase(temp_mod_it); - mult_exprs.erase(mult_it); - std::vector ret_eles = ExprSplitAddition(ret.second); - MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, - &has_mod); - if (has_mult) { - search_mod_it = mod_exprs.begin(); - } else if (has_mod && search_mod_it == mod_exprs.end()) { - search_mod_it--; - } - break; - } else { - ++mult_it; - } - } - find_opt = find_opt || inner_find_opt; - if (!inner_find_opt) { - ++search_mod_it; - } - } - if (!find_opt) { - return simplified_base; - } - for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; - } - for (std::list >::iterator it = mod_exprs.begin(); - it != mod_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) - : indexmod(it->first, it->second); - } - return no_opt_sum; -} inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { - PrimExpr base = make_const(DataType::Int(32), 0); //IntImm(DataType::Int(32), 0); + PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); arith::Analyzer ana; if (index.size() > 0) { @@ -256,7 +71,6 @@ size_t GetAxisSeparator(size_t shape_rank) { // // e.g. [N,C,H,W,c] -> TextureFlattening -> [N*C*H, W, c] // - return shape_rank - 2; } } From 6be6e6024e97d2b95c1bf5314b901cf0dd4231e7 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 5 Mar 2021 21:28:30 -0800 Subject: [PATCH 24/59] More refactor and clean up in texture lowering. --- src/tir/transforms/texture_flatten.cc | 89 ++++++++++----------------- 1 file changed, 33 insertions(+), 56 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d3544b5c02cf..cba4751bc668 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -70,7 +70,6 @@ size_t GetAxisSeparator(size_t shape_rank) { // axes are packed into rows. // // e.g. 
[N,C,H,W,c] -> TextureFlattening -> [N*C*H, W, c] - // return shape_rank - 2; } } @@ -112,7 +111,7 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } - // Buffer set + // External buffer std::unordered_set extern_buf_; // Storage scope std::unordered_map storage_scope_; @@ -146,8 +145,6 @@ class TextureFlattener : public TextureLoweringBase { Array shape; auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); - // TODO(csulivan): We do not currently handle the case where - // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { height *= op->bounds[i]->extent; @@ -155,7 +152,6 @@ class TextureFlattener : public TextureLoweringBase { width *= op->bounds[i]->extent; } } - Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -166,34 +162,11 @@ class TextureFlattener : public TextureLoweringBase { Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - std::string storage_scope = GetStorageScope(op->buffer); + // Lower to two dimensional access if (storage_scope == "texture") { - Array args; - if (let_binding_.count(op->buffer->data)) { - args.push_back(let_binding_[op->buffer->data]); - } else { - args.push_back(op->buffer->data); - } - - Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size())) { - col_dims.push_back(op->buffer->shape[i]); - col_indices.push_back(op->indices[i]); - } else { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } - } - - PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); - PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); - - args.push_back(row_offset); - args.push_back(col_offset); + Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); - stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); } @@ -203,37 +176,15 @@ class TextureFlattener : public TextureLoweringBase { PrimExpr VisitExpr_(const BufferLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - + // Replace with identitcal external buffer if one exists auto buffer = op->buffer; if (buffer_binds_.count(op->buffer)) { buffer = buffer_binds_[op->buffer]; } - + // Lower to two dimensional access std::string storage_scope = GetStorageScope(buffer); if (storage_scope == "texture") { - Array args; - if (let_binding_.count(op->buffer->data)) { - args.push_back(let_binding_[op->buffer->data]); - } else { - args.push_back(buffer->data); - } - - - Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size())) { - col_dims.push_back(op->buffer->shape[i]); - col_indices.push_back(op->indices[i]); - } else { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } - } - - PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); - PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); - args.push_back(row_offset); - args.push_back(col_offset); + Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); } @@ -243,6 +194,31 @@ class TextureFlattener : public 
TextureLoweringBase { protected: + template + Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { + Array args; + if (let_binding_.count(op->buffer->data)) { + args.push_back(let_binding_[op->buffer->data]); + } else { + args.push_back(buffer->data); + } + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } + } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + args.push_back(row_offset); + args.push_back(col_offset); + return args; + } + // Let binding std::unordered_map let_binding_; std::unordered_map buffer_binds_; @@ -283,7 +259,8 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access - if (GetStorageScope(load->buffer) == "texture" && GetStorageScope(op->buffer) == "texture") { + if (GetStorageScope(load->buffer) == "texture" && + GetStorageScope(op->buffer) == "texture") { auto store_index = SimplifyOffset(op->buffer->shape, op->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { From 48cf179dfe4bdd1bd959fcb46a55f821d71ee061 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sun, 7 Mar 2021 15:17:43 -0800 Subject: [PATCH 25/59] Add IsTextureType to tir and allow buffer var type annotation to be TextureType in addition to PointerType. --- include/tvm/tir/op.h | 15 +++++++++++++++ src/tir/ir/buffer.cc | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index 9cf7d0a3cd1f..c45a9d77a446 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -895,6 +895,21 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) { } return false; } +/*! + * \brief Check if type is a texture handle of a runtime element type. + * \param type The type to be checked. + * \param element_type The corresponding element type. + * \return The check results + */ +inline bool IsTextureType(const Type& type, const DataType& element_type) { + if (!type.defined()) return false; + if (const auto* ptr_type = type.as()) { + if (const auto* prim_type = ptr_type->element_type.as()) { + return prim_type->dtype == element_type; + } + } + return false; +} /*! * \brief Make a const value with certain data type. 
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index beee377d8401..cd6cffabcda4 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -166,7 +166,8 @@ Buffer::Buffer(Var data, DataType dtype, Array shape, Array if (storage_dtype == DataType::Bool()) { storage_dtype = DataType::Int(8); } - ICHECK(IsPointerType(data->type_annotation, storage_dtype)) + ICHECK(IsPointerType(data->type_annotation, storage_dtype) || + IsTextureType(data->type_annotation, storage_dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype; From 91f00ee89ebeeeb31d536ace1b66e68bdbaf83cc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 9 Mar 2021 14:27:00 -0800 Subject: [PATCH 26/59] Bug fix in texture access qualifier inference pass --- src/target/source/codegen_opencl.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index b8ff1d451445..05136045fc80 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -69,6 +69,7 @@ class InferTextureAccess : public StmtExprVisitor { var_access_map_[buffer] |= write_access; } } + StmtExprVisitor::VisitExpr_(op); } private: std::unordered_map var_access_map_; From 0ac0875c2e03cf2125346cd1324e69c639b66671 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 9 Mar 2021 11:29:04 -0800 Subject: [PATCH 27/59] Step toward handling external texture buffer forwarding when external buffer is not stored directly to cache_read realized buffer. For example when it is conditionally stored via an IfThenElse node when padding is used. --- src/tir/transforms/texture_flatten.cc | 33 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index cba4751bc668..1639f1cc2416 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -252,28 +252,48 @@ class ExternalBufferForwarding : public TextureLoweringBase { } Stmt VisitStmt_(const BufferStoreNode* op) final { + ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; + external_loads_.emplace_back(); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - if (auto load = op->value.as()) { + auto check_identity = [this](const BufferStoreNode* store, const BufferLoad& load) { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access if (GetStorageScope(load->buffer) == "texture" && - GetStorageScope(op->buffer) == "texture") { - auto store_index = SimplifyOffset(op->buffer->shape, op->indices); + GetStorageScope(store->buffer) == "texture") { + auto store_index = SimplifyOffset(store->buffer->shape, store->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { - extern_buffer_copy_.insert(op->buffer); - buffer_map_.insert({op->buffer, load->buffer}); + extern_buffer_copy_.insert(store->buffer); + buffer_map_.insert({store->buffer, load->buffer}); } } } - } + }; + if (auto load_node = op->value.as()) { + check_identity(op, GetRef(load_node)); + } else { + // Stored value is not a load, check for external loads collected + // when visiting the store node's value + for (auto& expr : external_loads_.back()) { + check_identity(op, Downcast(expr)); + } + } + 
external_loads_.pop_back(); return stmt; } + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + if (external_loads_.size() && extern_buf_.count(op->buffer)) { + external_loads_.back().push_back(expr); + } + return expr; + } + const std::unordered_map& GetForwardedBuffers() { return buffer_map_; } @@ -282,6 +302,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { std::deque realize_attrs_; std::unordered_set extern_buffer_copy_; std::unordered_map buffer_map_; + std::vector> external_loads_; }; From 50d4ee523bd38423aa43455f7866c1595a57db2a Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 17 Mar 2021 10:54:27 -0700 Subject: [PATCH 28/59] [Part 2/3] Support texture:weight lowering convention for externally provided texture buffers. Need to propagate this to allocated textures when cache_read(texture) is used for weights. --- src/tir/transforms/texture_flatten.cc | 34 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 1639f1cc2416..80c8accda588 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -63,14 +63,26 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] - return shape_rank - 2; + // Texture weight: + // e.g. [O,I,H,W,c] -> TextureFlattening -> [O, I*H*W, c] + size_t separator; + if (scope == "texture"){ + separator = shape_rank - 2; + } else if (scope == "texture:weight") { + separator = 1; + } + return separator; +} + +bool IsTextureStorage(std::string scope) { + return scope.find("texture") != std::string::npos; } } @@ -136,7 +148,7 @@ class TextureFlattener : public TextureLoweringBase { Stmt body = this->VisitStmt(op->body); std::string storage_scope = GetStorageScope(op->buffer); - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); @@ -146,7 +158,7 @@ class TextureFlattener : public TextureLoweringBase { auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator(op->bounds.size())) { + if (i < GetAxisSeparator(op->bounds.size(), storage_scope)) { height *= op->bounds[i]->extent; } else { width *= op->bounds[i]->extent; @@ -164,7 +176,7 @@ class TextureFlattener : public TextureLoweringBase { op = stmt.as(); std::string storage_scope = GetStorageScope(op->buffer); // Lower to two dimensional access - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); @@ -183,7 +195,7 @@ class TextureFlattener : public TextureLoweringBase { } // Lower to two dimensional access std::string storage_scope = GetStorageScope(buffer); - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); @@ -204,7 +216,7 @@ class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - 
if (i < GetAxisSeparator(op->buffer->shape.size())) { + if (i < GetAxisSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { @@ -236,7 +248,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (op->body->IsInstance()) { const auto* realize = Downcast(op->body).get(); std::string realize_scope = GetStorageScope(realize->buffer); - if (realize_scope == "texture" && extern_buffer_copy_.count(realize->buffer)) { + if (IsTextureStorage(realize_scope) && extern_buffer_copy_.count(realize->buffer)) { return realize_attrs_.back(); } else { if (realize_attrs_.size()) { @@ -261,8 +273,8 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access - if (GetStorageScope(load->buffer) == "texture" && - GetStorageScope(store->buffer) == "texture") { + if (IsTextureStorage(GetStorageScope(load->buffer)) && + IsTextureStorage(GetStorageScope(store->buffer))) { auto store_index = SimplifyOffset(store->buffer->shape, store->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { From fba2d3ff0d30e4bf8ad7b35132f578ffdef03129 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 8 Mar 2021 22:46:51 -0800 Subject: [PATCH 29/59] Bug fix in texture access qualifier inference pass --- src/target/source/codegen_opencl.cc | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 05136045fc80..c8a549a54491 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -40,7 +40,7 @@ class InferTextureAccess : public StmtExprVisitor { explicit InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { - this->operator()(n); + StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { if (texture.second == read_access) { @@ -56,21 +56,17 @@ class InferTextureAccess : public StmtExprVisitor { return storage_scope_qualifiers; } void VisitExpr_(const CallNode* op) { - if (!op->args.size()) - { - return; + if (op->op.same_as(builtin::text2d_load())) { + var_access_map_[op->args[0].as()] |= read_access; } - if (const VarNode* buffer = op->args[0].as()) - { - if (op->op.same_as(builtin::text2d_load())) { - var_access_map_[buffer] |= read_access; - } - else if (op->op.same_as(builtin::text2d_store())) { - var_access_map_[buffer] |= write_access; - } + else if (op->op.same_as(builtin::text2d_store())) { + var_access_map_[op->args[0].as()] |= write_access; + } else { + StmtExprVisitor::VisitExpr_(op); } StmtExprVisitor::VisitExpr_(op); } + private: std::unordered_map var_access_map_; }; From 2fc4238d97e60d6275d68721d00f3f346dfdc1e2 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 12 Mar 2021 14:03:11 -0800 Subject: [PATCH 30/59] Tighten constraint on external buffer forwarding -- cache_read(texture) cancellation -- to avoid incorrect programs. Currently only forward through if_then_else node and direct external loads. For if_then_else, still need proper analysis of structural equality between buffers and access patterns to determine if an external buffer can replace the texture buffer realized via cache_read. 
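
The soundness condition in the if_then_else case reduces to proving that the guarded external load addresses exactly the element being stored into the cache_read buffer. A minimal sketch of that style of access-equality check, using only the arithmetic analyzer; the shape extents, index expressions, and function name below are illustrative, not taken from this series:

    // Sketch: prove two flattened offsets denote the same element, in the same
    // spirit as comparing SimplifyOffset results for a store/load pair.
    #include <tvm/arith/analyzer.h>
    #include <tvm/tir/expr.h>
    #include <tvm/tir/op.h>
    #include <tvm/tir/var.h>

    bool SameAccess() {
      using namespace tvm;
      tir::Var i("i", DataType::Int(32)), j("j", DataType::Int(32));
      // Offsets into an illustrative [16, 4] buffer, written in two different forms.
      PrimExpr store_index = i * 4 + j;
      PrimExpr load_index = i * 2 * 2 + j;
      arith::Analyzer analyzer;
      return analyzer.CanProve(tir::EQ(store_index, load_index));
    }

The analyzer-based index comparison is the piece already in place; the structural comparison of the buffers themselves is the part called out above as still missing.
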
--- src/tir/transforms/texture_flatten.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 80c8accda588..c2398ed9ee39 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -265,7 +265,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { Stmt VisitStmt_(const BufferStoreNode* op) final { ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; - external_loads_.emplace_back(); + if (auto* call_node = op->value.as()) { + // Path to supporting external cache_read canceling when padding has induced + // a conditional load into the cache_read buffer. We may be able to elide the + // conditional completely due to hardware support for returning 0 when OOB + if (call_node->op.same_as(builtin::if_then_else())) { + external_loads_.emplace_back(); + } + } Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -287,14 +294,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (auto load_node = op->value.as()) { check_identity(op, GetRef(load_node)); - } else { + } else if (external_loads_.size()) { // Stored value is not a load, check for external loads collected - // when visiting the store node's value + // when visiting the store node's value, e.g. from if_then_else for (auto& expr : external_loads_.back()) { check_identity(op, Downcast(expr)); } + external_loads_.pop_back(); } - external_loads_.pop_back(); return stmt; } From 384dad02637e0b1f08bd71725b8f8aa264273426 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:56:31 -0700 Subject: [PATCH 31/59] Use texture lowering convention from texture runtime util. --- src/tir/transforms/texture_flatten.cc | 37 +++++++-------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c2398ed9ee39..8b0989d9e40c 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -41,6 +41,7 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" +#include "../../runtime/texture.h" #include "../ir/buffer_common.h" #include "arg_binder.h" #include "ir_utils.h" @@ -63,24 +64,6 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] - // Texture weight: - // e.g. 
[O,I,H,W,c] -> TextureFlattening -> [O, I*H*W, c] - size_t separator; - if (scope == "texture"){ - separator = shape_rank - 2; - } else if (scope == "texture:weight") { - separator = 1; - } - return separator; -} - bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } @@ -155,15 +138,13 @@ class TextureFlattener : public TextureLoweringBase { ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; Array shape; - auto width = IntImm(DataType::Int(32), 1); - auto height = IntImm(DataType::Int(32), 1); - for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator(op->bounds.size(), storage_scope)) { - height *= op->bounds[i]->extent; - } else { - width *= op->bounds[i]->extent; - } - } + Integer width = 1, height = 1; + size_t axis = runtime::DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + struct Shape { + Array bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + std::tie(width, height) = runtime::ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -216,7 +197,7 @@ class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { + if (i < runtime::DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { From 3c1b1220641f6e34a48201ed97d60df6ba75b839 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:58:25 -0700 Subject: [PATCH 32/59] Use updated texture lowering utilities --- src/tir/transforms/texture_flatten.cc | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 8b0989d9e40c..e4e8861b492e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -50,6 +50,10 @@ namespace tvm { namespace tir { namespace { +using runtime::IsTextureStorage; +using runtime::DefaultTextureLayoutSeparator; +using runtime::ApplyTexture2DFlattening; + inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); @@ -63,10 +67,6 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; - Array shape; - Integer width = 1, height = 1; - size_t axis = runtime::DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); struct Shape { - Array bounds; + const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; - std::tie(width, height) = runtime::ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); - Array args = {width, height}; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); + Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -197,7 +195,7 @@ 
class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < runtime::DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { + if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { From 137da362ae50180e8f5e03b3368fa309329161cc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 15 Mar 2021 16:29:18 -0700 Subject: [PATCH 33/59] Use inherited visitor overloads in texture flattener. --- src/tir/transforms/texture_flatten.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index e4e8861b492e..cbc7a3ced362 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -114,6 +114,7 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: + using StmtExprMutator::VisitStmt_; explicit TextureFlattener(const Map& extern_buffer_map, const std::unordered_map& extern_buffer_binds_) : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} From 5b6787e27e74773a48b577219b8214e68ce11628 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 16 Mar 2021 23:08:03 -0700 Subject: [PATCH 34/59] Add check in codegen for float/half until read/write_image codegen supports other types. --- src/target/source/codegen_opencl.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index c8a549a54491..a17897d2d10a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -358,6 +358,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { } else if (buffer_type.is_float()) { os << "write_imagef("; + } else { + LOG(FATAL) << "Unsupported type: " << buffer_type + << ", currently only float and half are supported for image2d OpenCL codegen."; } this->PrintExpr(op->args[0], os); os << ", "; @@ -375,6 +378,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { } else if (op->dtype.is_float()) { ss << "read_imagef("; + } else { + LOG(FATAL) << "Unsupported type: " << op->dtype + << ", currently only float and half are supported for image2d OpenCL codegen."; } this->PrintExpr(op->args[0], ss); ss << ", "; From c839505e689712ea428898dfa0f1d4bb363b0bc4 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 17 Mar 2021 09:57:59 -0700 Subject: [PATCH 35/59] Rename tir texture builtins --- include/tvm/tir/builtin.h | 10 +++++----- src/target/source/codegen_opencl.cc | 18 +++++++++--------- src/tir/op/builtin.cc | 6 +++--- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/texture_flatten.cc | 6 +++--- src/tir/transforms/vectorize_loop.cc | 8 ++------ 6 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 66fa069d62fa..86857a33cdf4 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -603,17 +603,17 @@ TVM_DLL const Op& atomic_add(); /*! * \brief Create a texture 2d memory allocation */ -TVM_DLL const Op& text2d_alloca(); +TVM_DLL const Op& texture2d_alloca(); /*! 
- * \brief Store to a texture 2d memory + * \brief Store to texture 2d memory */ -TVM_DLL const Op& text2d_store(); +TVM_DLL const Op& texture2d_store(); /*! - * \brief Load from a texture 2d memory + * \brief Load from texture 2d memory */ -TVM_DLL const Op& text2d_load(); +TVM_DLL const Op& texture2d_load(); /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index a17897d2d10a..87cbe8dd4d5a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -56,10 +56,9 @@ class InferTextureAccess : public StmtExprVisitor { return storage_scope_qualifiers; } void VisitExpr_(const CallNode* op) { - if (op->op.same_as(builtin::text2d_load())) { + if (op->op.same_as(builtin::texture2d_load())) { var_access_map_[op->args[0].as()] |= read_access; - } - else if (op->op.same_as(builtin::text2d_store())) { + } else if (op->op.same_as(builtin::texture2d_store())) { var_access_map_[op->args[0].as()] |= write_access; } else { StmtExprVisitor::VisitExpr_(op); @@ -304,7 +303,7 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { if (auto call = op->value.as()) { - if (call->op.same_as(builtin::text2d_load())) { + if (call->op.same_as(builtin::texture2d_load())) { need_texture_ssa_ = false; // If storing a texture load into a buffer, don't use an // intermediate local unless the buffer allocation is a @@ -322,7 +321,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { if (auto call = op->value.as()) { - if (call->op.same_as(builtin::text2d_load())) { + if (call->op.same_as(builtin::texture2d_load())) { need_texture_ssa_ = false; } } @@ -349,9 +348,10 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; this->PrintExpr(load->index, os); os << ')'; - } else if (op->op.same_as(builtin::text2d_store())) { - auto* texture_type = op->args[0].as()->type_annotation.as(); - ICHECK(texture_type != nullptr) << "builtin::text2d_store() only supports storing to texture buffers"; + } else if (op->op.same_as(builtin::texture2d_store())) { + auto* texture_type = op->args[0].as()->type_annotation.as(); + ICHECK(texture_type != nullptr) + << "builtin::texture2d_store() only supports storing to texture buffers"; DataType buffer_type = texture_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; @@ -371,7 +371,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << "), "; this->PrintExpr(op->args[3], os); os << ")"; - } else if (op->op.same_as(builtin::text2d_load())) { + } else if (op->op.same_as(builtin::texture2d_load())) { std::stringstream ss; if (op->dtype.is_float16()) { ss << "read_imageh("; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 7705369eb5c8..c593cbf7290c 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -246,14 +246,14 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) +TIR_DEFINE_BUILTIN_FUNC(texture2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_store) +TIR_DEFINE_BUILTIN_FUNC(texture2d_store) .set_attr("TVectorizable", 
true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_load) +TIR_DEFINE_BUILTIN_FUNC(texture2d_load) .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 19d434006b83..9c28c6f55926 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -100,7 +100,7 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt_(const LetStmtNode* op) final { if (const CallNode* call = op->value.as()) { - if (call->op.same_as(builtin::text2d_alloca())) { + if (call->op.same_as(builtin::texture2d_alloca())) { return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); } } diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index cbc7a3ced362..740742b1a0ff 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -145,7 +145,7 @@ class TextureFlattener : public TextureLoweringBase { size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; - stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } return stmt; @@ -159,7 +159,7 @@ class TextureFlattener : public TextureLoweringBase { if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); - stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); + stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); } return stmt; @@ -178,7 +178,7 @@ class TextureFlattener : public TextureLoweringBase { if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); - expr = Call(op->buffer->dtype, builtin::text2d_load(), args); + expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); } return expr; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 9943d1e37938..cd2d230f5775 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -265,18 +265,14 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::if_then_else())) { return MutateIfThenElseExpr_(op); - } - else if (op->op.same_as(builtin::text2d_load())) - { + } else if (op->op.same_as(builtin::texture2d_load())) { int lane = 0; Array fcd = MutateArray({op->args.back()}, &lane); auto new_args = op->args; new_args.pop_back(); new_args.push_back(fcd[0]); return Call(op->dtype.with_lanes(4), op->op, new_args); - } - else if (op->op.same_as(builtin::text2d_store())) - { + } else if (op->op.same_as(builtin::texture2d_store())) { int lane = 0; // Vectorize the value to store Array value{op->args.back()}; From 8041cc900089977e6915f641fd5b5c82065648d8 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 23 Mar 2021 11:34:54 -0700 Subject: [PATCH 36/59] Remove codegen and tir runtime dependence on for TVMBackendAlloc/FreeTexture. 
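
The replacement path, introduced in the following patch, lowers texture allocations to target-specialized tir.tvm_call_packed calls named "device_api.<device>.AllocTexture" / ".FreeTexture". A rough runtime-side sketch of how such a call resolves; the "device_api.opencl.AllocTexture" name mirrors the prefix composed during lowering, and whether a given runtime actually registers that global is an assumption here:

    // Sketch: resolve and invoke a device-API packed function by name.
    #include <cstdint>
    #include <tvm/runtime/registry.h>

    void* AllocTextureViaPackedCall(int device_type, int device_id, uint64_t width,
                                    uint64_t height, int dtype_code, int dtype_bits) {
      const tvm::runtime::PackedFunc* f =
          tvm::runtime::Registry::Get("device_api.opencl.AllocTexture");
      if (f == nullptr) return nullptr;  // runtime does not provide this device API hook
      // Argument order mirrors the packed call emitted by the lowering pass.
      return (*f)(device_type, device_id, width, height, dtype_code, dtype_bits);
    }

Routing through the packed-function registry keeps the generated host code free of texture-specific C symbols, which is what allows the hard-coded TVMBackendAllocTexture/FreeTexture externs to be dropped here.
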
--- src/target/llvm/codegen_cpu.cc | 2 -- src/tir/op/runtime.cc | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 8b01f9d9186e..ab96d6e69d14 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -403,8 +403,6 @@ void CodeGenCPU::InitGlobalContext(bool dynamic_lookup) { // Mark as context functions gv_func_map_["TVMBackendAllocWorkspace"] = nullptr; gv_func_map_["TVMBackendFreeWorkspace"] = nullptr; - gv_func_map_["TVMBackendAllocTexture"] = nullptr; - gv_func_map_["TVMBackendFreeTexture"] = nullptr; } } } diff --git a/src/tir/op/runtime.cc b/src/tir/op/runtime.cc index 2a894d00ec0c..adabae9e75f7 100644 --- a/src/tir/op/runtime.cc +++ b/src/tir/op/runtime.cc @@ -37,15 +37,5 @@ TVM_REGISTER_OP("tir.TVMBackendFreeWorkspace") .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TVM_REGISTER_OP("tir.TVMBackendAllocTexture") - .set_num_inputs(6) - .set_attr("TGlobalSymbol", "TVMBackendAllocTexture") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - -TVM_REGISTER_OP("tir.TVMBackendFreeTexture") - .set_num_inputs(3) - .set_attr("TGlobalSymbol", "TVMBackendFreeTexture") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - } // namespace tir } // namespace tvm From 684e513042f32de77aad5a190683cfe92962c034 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 29 Apr 2021 15:32:57 -0700 Subject: [PATCH 37/59] Dispatch texture allocas via target specialized tir.tvm_call_packed --- src/tir/transforms/lower_tvm_builtin.cc | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 9c28c6f55926..83e6e97a428f 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -360,20 +360,25 @@ class BuiltinLower : public StmtExprMutator { throw_last_error), let->body}); DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; - Stmt alloca = LetStmt( - let->var, - Call(let->var.dtype(), Op::Get("tir.TVMBackendAllocTexture"), - {cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), - cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), - IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}), - body); - PrimExpr free_op = Call(DataType::Int(32), Op::Get("tir.TVMBackendFreeTexture"), - {cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + std::string fdevapi_prefix = "device_api."; + fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); + Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), + cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), + cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}); + + Stmt alloca = LetStmt(let->var, call_packed, body); + + Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), + cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); + Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); 
return body; From 069ec771cf2d0d7a07047a3191d7294dd2f94517 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 4 May 2021 15:23:36 -0700 Subject: [PATCH 38/59] Remove kTexture scope and use kGlobal with texture tag. --- src/runtime/thread_storage_scope.h | 7 ------- src/te/operation/op_utils.cc | 2 +- src/te/schedule/bound.cc | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index 611a40d996ea..ac8260ffbe39 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -59,8 +59,6 @@ enum class StorageRank { kWMMAMatrixB = 5, /*! \brief wmma scope memory of accumulator */ kWMMAAccumulator = 6, - /*! \brief global scope texture memory */ - kTexture = 7, }; /*! @@ -110,8 +108,6 @@ struct StorageScope { return "wmma.matrix_b" + tag; case StorageRank::kWMMAAccumulator: return "wmma.accumulator" + tag; - case StorageRank::kTexture: - return "texture" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; @@ -147,9 +143,6 @@ struct StorageScope { } else if (s.compare(0, 16, "wmma.accumulator") == 0) { r.rank = StorageRank::kWMMAAccumulator; r.tag = s.substr(16, std::string::npos); - } else if (s.compare(0, 7, "texture") == 0) { - r.rank = StorageRank::kTexture; - r.tag = s.substr(7, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; } diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index de0d6b5be848..ddc78866ae02 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -161,7 +161,7 @@ std::vector > MakeLoopNest(const Stage& stage, } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); - if (static_cast(ss.rank) <= ts.rank || ss.rank == runtime::StorageRank::kTexture) { + if (static_cast(ss.rank) <= ts.rank) { value_map[iv] = var; } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index c7ec8f23892c..12c9b5538b44 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -66,7 +66,7 @@ bool NeedRelax(const IterVar& iv, bool found_attach, if (scope.rank == StorageRank::kWarp && ts.rank == 1 && ts.dim_index == 0) { return true; } - return static_cast(scope.rank) <= ts.rank || scope.rank == StorageRank::kTexture; + return static_cast(scope.rank) <= ts.rank; } // infer storage scope, if not given From 86bb2e6e05234d08829f3ed76f2fbdc315b11cf6 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 10 May 2021 17:02:46 -0700 Subject: [PATCH 39/59] Remove TextureType. --- include/tvm/ir/type.h | 49 -------------------------------- include/tvm/ir/type_functor.h | 4 --- include/tvm/tir/op.h | 15 ---------- src/ir/type.cc | 27 ------------------ src/ir/type_functor.cc | 12 -------- src/printer/text_printer.h | 1 - src/printer/tir_text_printer.cc | 6 ---- src/printer/tvmscript_printer.cc | 7 ----- src/tir/op/op.cc | 2 +- 9 files changed, 1 insertion(+), 122 deletions(-) diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index 8d073e88b0ab..c772650809fa 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -189,55 +189,6 @@ class PointerType : public Type { TVM_DEFINE_OBJECT_REF_METHODS(PointerType, Type, PointerTypeNode); }; -/*! - * \brief Low-level texture type. 
- * - * TextureType represents type hints in the TIR to be - * passed to the final code generator. - * - * TextureType should not occur in the high-level analysis. - * - * \sa TextureType - */ -class TextureTypeNode : public TypeNode { - public: - /*! - * \brief The base type of the texture. - */ - Type element_type; - - void VisitAttrs(AttrVisitor* v) { v->Visit("element_type", &element_type); } - - bool SEqualReduce(const TextureTypeNode* other, SEqualReducer equal) const { - return equal(element_type, other->element_type); - } - - void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(element_type); } - - static constexpr const char* _type_key = "TextureType"; - TVM_DECLARE_FINAL_OBJECT_INFO(TextureTypeNode, TypeNode); -}; - -/* - * \brief Managed reference to TextureTypeNode. - * \sa TextureTypeNode - */ -class TextureType : public Type { - public: - /*! - * \brief Constructor - * \param element_type The base type of the texture. - */ - TVM_DLL explicit TextureType(Type element_type); - /*! - * \brief Constructor - * \param element_type The base type of the texture. - */ - TVM_DLL explicit TextureType(runtime::DataType dtype); - - TVM_DEFINE_OBJECT_REF_METHODS(TextureType, Type, TextureTypeNode); -}; - /*! \brief Possible kinds of TypeVars. */ enum TypeKind : int { kType = 0, diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index c71051e6f61c..11bf7d4740d0 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -89,7 +89,6 @@ class TypeFunctor { virtual R VisitType_(const TypeDataNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PrimTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PointerTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; - virtual R VisitType_(const TextureTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitTypeDefault_(const Object* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; // unreachable, written to stop compiler warning @@ -113,7 +112,6 @@ class TypeFunctor { TVM_TYPE_FUNCTOR_DISPATCH(TypeDataNode); TVM_TYPE_FUNCTOR_DISPATCH(PrimTypeNode); TVM_TYPE_FUNCTOR_DISPATCH(PointerTypeNode); - TVM_TYPE_FUNCTOR_DISPATCH(TextureTypeNode); return vtable; } }; @@ -137,7 +135,6 @@ class TVM_DLL TypeVisitor : public TypeFunctor { void VisitType_(const TypeDataNode* op) override; void VisitType_(const PrimTypeNode* op) override; void VisitType_(const PointerTypeNode* op) override; - void VisitType_(const TextureTypeNode* op) override; }; /*! @@ -158,7 +155,6 @@ class TVM_DLL TypeMutator : public TypeFunctor { Type VisitType_(const TypeDataNode* op) override; Type VisitType_(const PrimTypeNode* op) override; Type VisitType_(const PointerTypeNode* op) override; - Type VisitType_(const TextureTypeNode* op) override; private: Array MutateArray(Array arr); diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index c45a9d77a446..9cf7d0a3cd1f 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -895,21 +895,6 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) { } return false; } -/*! - * \brief Check if type is a texture handle of a runtime element type. - * \param type The type to be checked. - * \param element_type The corresponding element type. 
- * \return The check results - */ -inline bool IsTextureType(const Type& type, const DataType& element_type) { - if (!type.defined()) return false; - if (const auto* ptr_type = type.as()) { - if (const auto* prim_type = ptr_type->element_type.as()) { - return prim_type->dtype == element_type; - } - } - return false; -} /*! * \brief Make a const value with certain data type. diff --git a/src/ir/type.cc b/src/ir/type.cc index 5e0c8911c543..fe8e00329bbc 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -67,33 +67,6 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << '*'; }); -TextureType::TextureType(Type element_type) { - ObjectPtr n = make_object(); - n->element_type = std::move(element_type); - data_ = std::move(n); -} -TextureType::TextureType(runtime::DataType dtype) { - ObjectPtr n = make_object(); - n->element_type = PrimType(dtype); - data_ = std::move(n); -} - - -TVM_REGISTER_NODE_TYPE(TextureTypeNode); - -TVM_REGISTER_GLOBAL("ir.TextureType").set_body_typed([](Type element_type) { - return TextureType(element_type); -}); - -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "texture "; - p->Print(node->element_type); - p->stream << '*'; - }); - - TypeVar::TypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc index e084a82ed7be..51d5d3778c10 100644 --- a/src/ir/type_functor.cc +++ b/src/ir/type_functor.cc @@ -89,8 +89,6 @@ void TypeVisitor::VisitType_(const PrimTypeNode* op) {} void TypeVisitor::VisitType_(const PointerTypeNode* op) { this->VisitType(op->element_type); } -void TypeVisitor::VisitType_(const TextureTypeNode* op) { this->VisitType(op->element_type); } - Type TypeMutator::VisitType(const Type& t) { return t.defined() ? TypeFunctor::VisitType(t) : t; } @@ -200,16 +198,6 @@ Type TypeMutator::VisitType_(const PointerTypeNode* op) { } } -Type TypeMutator::VisitType_(const TextureTypeNode* op) { - Type element_type = VisitType(op->element_type); - - if (element_type.same_as(op->element_type)) { - return GetRef(op); - } else { - return TextureType(element_type); - } -} - // Implements bind. 
class TypeBinder : public TypeMutator { public: diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 55f68f3e36cb..0332a2d539d2 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -333,7 +333,6 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; - Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintIRModule(const IRModule& module); diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index b137ae34107d..f232994480f8 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -613,12 +613,6 @@ Doc TIRTextPrinter::VisitType_(const PointerTypeNode* node) { return doc; } -Doc TIRTextPrinter::VisitType_(const TextureTypeNode* node) { - Doc doc; - doc << "Texture(" << Print(node->element_type) << ")"; - return doc; -} - Doc TIRTextPrinter::VisitType_(const TupleTypeNode* node) { std::vector fields; for (Type field : node->fields) { diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 39852c39b82a..cc7536b48cfd 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -145,7 +145,6 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; - Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintBody(const Stmt& body); @@ -733,12 +732,6 @@ Doc TVMScriptPrinter::VisitType_(const PointerTypeNode* node) { return doc; } -Doc TVMScriptPrinter::VisitType_(const TextureTypeNode* node) { - Doc doc; - doc << "ty.Texture[" << Print(node->element_type) << "]"; - return doc; -} - Doc TVMScriptPrinter::VisitType_(const TupleTypeNode* node) { if (node->fields.empty()) { return Doc::Text("None"); diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index d03cf22094a8..d29132450227 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -51,7 +51,7 @@ using namespace tir; runtime::DataType GetRuntimeDataType(const Type& type) { if (auto* n = type.as()) { return n->dtype; - } else if (type.as() || type.as()) { + } else if (type.as()) { return DataType::Handle(); } else if (IsVoidType(type)) { return DataType::Void(); From 3e17295194c6acfb2117c7b987940bf18360b0d4 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 19 May 2021 16:54:21 -0700 Subject: [PATCH 40/59] Remove TextureType from OpenCL codegen. 
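
With the dedicated type gone, a texture handle is represented as an ordinary PointerType whose storage_scope carries the texture tag, and codegen branches on that scope string (runtime::IsTextureStorage) to emit image2d_t instead of a pointer. A minimal sketch of constructing such a handle; the variable name is illustrative:

    // Sketch: a texture-backed buffer var after TextureType removal.
    #include <tvm/ir/type.h>
    #include <tvm/runtime/data_type.h>
    #include <tvm/tir/var.h>

    tvm::tir::Var MakeTextureHandle() {
      using namespace tvm;
      // "texture" (or "texture:weight") is what runtime::IsTextureStorage() recognizes.
      Type handle_type = PointerType(PrimType(DataType::Float(32)), "texture");
      return tir::Var("tex_buf", handle_type);
    }

Reusing PointerType this way avoids threading a second handle type through the type functors and printers, which the preceding patch already deleted from the IR.
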
--- src/target/source/codegen_opencl.cc | 33 +++++++++++++++++------------ src/tir/ir/buffer.cc | 3 +-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 87cbe8dd4d5a..5da29ca4643c 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -28,6 +28,7 @@ #include "../../runtime/opencl/opencl_module.h" #include "../../runtime/thread_storage_scope.h" +#include "../../runtime/texture.h" #include "../build_common.h" namespace tvm { @@ -77,10 +78,10 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); this->SetTextureScope(InferTextureAccess().Infer(f->body)); for (Var arg : f->params) { - if (arg->type_annotation.as()) - { + auto ptr_type = arg->type_annotation.as(); + if (ptr_type && runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) { // Storage scope qualifiers for textures are inferred - // and set prior function codegen. + // and set prior to function codegen. continue; } else if (arg.dtype().is_handle()) { @@ -211,10 +212,12 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* if (auto* ptr = type.as()) { return PrintType(ptr->dtype, os); } else if (auto* ptr = type.as()) { - PrintType(ptr->element_type, os); - os << '*'; - } else if (type.as()){ - os << "image2d_t"; + if (runtime::IsTextureStorage(std::string(ptr->storage_scope))) { + os << "image2d_t"; + } else { + PrintType(ptr->element_type, os); + os << '*'; + } } else if (IsVoidType(type)) { os << "void"; } else { @@ -278,10 +281,11 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os } void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) { - // Only apply restrict qualifer for non-texture types - if (v->type_annotation.as() == nullptr) - { - os << ' ' << restrict_keyword_; + // Apply restrict qualifer for non-texture types only + if (auto* ptr = v->type_annotation.as()) { + if (!runtime::IsTextureStorage(std::string(ptr->storage_scope))) { + os << ' ' << restrict_keyword_; + } } } @@ -349,10 +353,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(load->index, os); os << ')'; } else if (op->op.same_as(builtin::texture2d_store())) { - auto* texture_type = op->args[0].as()->type_annotation.as(); - ICHECK(texture_type != nullptr) + auto* ptr_type = op->args[0].as()->type_annotation.as(); + ICHECK(ptr_type != nullptr) << "Texture Var's must be of PointerType"; + ICHECK(runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) << "builtin::texture2d_store() only supports storing to texture buffers"; - DataType buffer_type = texture_type->element_type.as()->dtype; + DataType buffer_type = ptr_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; } diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index cd6cffabcda4..beee377d8401 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -166,8 +166,7 @@ Buffer::Buffer(Var data, DataType dtype, Array shape, Array if (storage_dtype == DataType::Bool()) { storage_dtype = DataType::Int(8); } - ICHECK(IsPointerType(data->type_annotation, storage_dtype) || - IsTextureType(data->type_annotation, storage_dtype)) + ICHECK(IsPointerType(data->type_annotation, storage_dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype; From 
b3cdc52082aab042046d3f216f1a7909152fc16c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 19 May 2021 16:55:06 -0700 Subject: [PATCH 41/59] Remove TextureType from TIR lowering. --- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/texture_flatten.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 83e6e97a428f..daa868668c47 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -359,7 +359,7 @@ class BuiltinLower : public StmtExprMutator { Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); - DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 740742b1a0ff..1544b68a55a4 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -124,14 +124,14 @@ class TextureFlattener : public TextureLoweringBase { return this->VisitStmt(op->body); } - Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, PointerType(PrimType(op->buffer->dtype), String(storage_scope))); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); Stmt body = this->VisitStmt(op->body); - std::string storage_scope = GetStorageScope(op->buffer); if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; From 17b8808a648e71e4a4a6181ca3fabd23a258b052 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 22:59:36 -0700 Subject: [PATCH 42/59] Remove dependency on MergeMulMod. 
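
SimplifyOffset keeps the same row-major folding as before and simply hands each partial sum to the bound analyzer instead of MergeMulMod. The arithmetic it builds symbolically, shown on plain integers (a standalone sketch, not code from the pass):

    // Sketch: row-major flattening of an index against a shape.
    // For shape (s0, s1, s2) and index (i0, i1, i2):
    //   offset = (i0 * s1 + i1) * s2 + i2
    #include <cstddef>
    #include <vector>

    std::size_t FlattenOffset(const std::vector<std::size_t>& shape,
                              const std::vector<std::size_t>& index) {
      std::size_t offset = index.empty() ? 0 : index[0];
      for (std::size_t i = 1; i < index.size(); ++i) {
        offset = offset * shape[i] + index[i];
      }
      return offset;
    }
    // e.g. shape {2, 3, 4}, index {1, 2, 3} -> (1 * 3 + 2) * 4 + 3 = 23

The analyzer is the same IRVisitorWithAnalyzer that walks the function body first, so the partial sums are simplified under the loop-variable bounds it has already collected.
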
--- src/tir/transforms/texture_flatten.cc | 53 +++++++++++++++------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 1544b68a55a4..25770b2438e3 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -48,30 +48,15 @@ namespace tvm { namespace tir { -namespace { - using runtime::IsTextureStorage; using runtime::DefaultTextureLayoutSeparator; using runtime::ApplyTexture2DFlattening; -inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { - PrimExpr base = make_const(DataType::Int(32), 0); - ICHECK_EQ(shape.size(), index.size()); - arith::Analyzer ana; - if (index.size() > 0) { - PrimExpr offset = index[0]; - for (size_t i = 1; i < index.size(); ++i) { - offset = MergeMulMod(&ana, offset * shape[i] + index[i]); - } - base = base + offset; - } - return base; -} -} class TextureLoweringBase : public StmtExprMutator { public: - explicit TextureLoweringBase(const Map& extern_buffer_map) { + explicit TextureLoweringBase(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { for (auto kv : extern_buffer_map) { extern_buf_.insert(kv.second); } @@ -92,6 +77,19 @@ class TextureLoweringBase : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } + inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + protected: std::string GetStorageScope(const Buffer& buffer) { @@ -106,18 +104,22 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } + // TODO: need docs // External buffer std::unordered_set extern_buf_; // Storage scope std::unordered_map storage_scope_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; }; class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; explicit TextureFlattener(const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_) - : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} + const std::unordered_map& extern_buffer_binds_, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer), buffer_binds_(extern_buffer_binds_) {;} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -211,6 +213,7 @@ class TextureFlattener : public TextureLoweringBase { return args; } + // TODO: Need docs // Let binding std::unordered_map let_binding_; std::unordered_map buffer_binds_; @@ -219,8 +222,9 @@ class TextureFlattener : public TextureLoweringBase { class ExternalBufferForwarding : public TextureLoweringBase { public: - explicit ExternalBufferForwarding(const Map& extern_buffer_map) - : TextureLoweringBase(extern_buffer_map) {;} + explicit ExternalBufferForwarding(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {;} Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); @@ -307,9 +311,12 @@ class ExternalBufferForwarding : public TextureLoweringBase { PrimFunc TextureFlatten(PrimFunc func) { auto fptr = 
func.CopyOnWrite(); - ExternalBufferForwarding forward(fptr->buffer_map); + + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers())(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), &bound_analyzer)(std::move(fptr->body)); return func; } From c758eb9baf0f0109b87e9d124db89dbbcd13ad49 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 23:06:25 -0700 Subject: [PATCH 43/59] Revert "Add buffer_common.h to house buffer offset simplification routines." This reverts commit 027628259229aaee051dbf1dfbed4e63ef820544. --- src/tir/ir/buffer.cc | 240 +++++++++++++++++++++++++- src/tir/transforms/texture_flatten.cc | 2 - 2 files changed, 239 insertions(+), 3 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index beee377d8401..6a102339bcea 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -29,13 +29,17 @@ #include #include -#include "buffer_common.h" +#include +#include #include "../../arith/pattern_match.h" namespace tvm { namespace tir { +using IndexMod = tir::FloorModNode; +using IndexDiv = tir::FloorDivNode; + Array SimplifyArray(arith::Analyzer* ana, Array array) { for (size_t i = 0; i < array.size(); ++i) { array.Set(i, ana->Simplify(array[i])); @@ -50,6 +54,240 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } +namespace { +// Split the given expression w.r.t the add operator +inline std::vector ExprSplitAddition(const PrimExpr& expr) { + using namespace tir; + std::vector ret; + std::stack split_buffer; + split_buffer.push(&expr); + while (!split_buffer.empty()) { + const PrimExpr* top_ele = split_buffer.top(); + split_buffer.pop(); + auto expr_add_match = top_ele->as(); + if (expr_add_match) { + split_buffer.push(&expr_add_match->b); + split_buffer.push(&expr_add_match->a); + } else { + ret.emplace_back(top_ele); + } + } + return ret; +} + +// Searches for the following types of expr: +// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// mod_l_expr = c +// mod_r_expr = k1 * k2 * ... * ki +// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) +// Currently the we will not search the add/mult combinations exhaustively +// as it will take too much computation. +inline std::pair MergeMulModInner(const PrimExpr& mult_expr, + const PrimExpr& mod_l_expr, + const PrimExpr& mod_r_expr) { + using namespace tir; + const MulNode* mult_ptr = mult_expr.as(); + if (!mult_ptr) return std::make_pair(false, PrimExpr()); + PrimExpr mult_outer = mult_ptr->b; + const PrimExpr* inner = &(mult_ptr->a); + // 1. Calculate the outer multiplier + while (true) { + mult_ptr = inner->as(); + if (mult_ptr) { + inner = &(mult_ptr->a); + mult_outer = mult_ptr->b * mult_outer; + } else { + break; + } + } + // 2. Search for the pattern c / (...) * (...) + c % (...) + // We match the search element with Add, Mul and Div. + // If Add is found, we need to continue our search for the rhs + // If Mult is found, we will expand the inner multiplication factor + // If Div is found, we will go on testing whether lhs matches the lhs of mod expr + // and returns the optimization result. 
+ const PrimExpr* search_ptr = inner; + PrimExpr mult_inner; // The inner multiplication factor + PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized + tir::ExprDeepEqual expr_equal; + + while (true) { + auto inner_div_ptr = search_ptr->as(); + auto inner_mult_ptr = search_ptr->as(); + auto inner_add_ptr = search_ptr->as(); + if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { + return std::make_pair(false, PrimExpr()); + } else if (inner_div_ptr) { + PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; + if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && + expr_equal(inner_div_ptr->a, mod_l_expr)) { + // Found! + PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; + return std::make_pair(true, ret); + } else { + return std::make_pair(false, PrimExpr()); + } + } else if (inner_mult_ptr) { + mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; + search_ptr = &(inner_mult_ptr->a); + } else if (inner_add_ptr) { + if (mult_inner.get()) { + return std::make_pair(false, PrimExpr()); + } + no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; + search_ptr = &(inner_add_ptr->b); + } else { + LOG(FATAL) << "Unexpected search result!"; + break; + } + } + return std::make_pair(false, PrimExpr()); +} + +// Insert the elements into the corresponding mult_exprs and mod_exprs. +// If the element is found to match Mul, it will be pushed to the mult_exprs. +// If the element it found to match Mod, it will be pused to the mod_exprs. +// Otherwise, the elements will be added to the no_opt_sum variable +inline void MergeMulModInsertElements(const std::vector& eles, + std::list* mult_exprs, + std::list >* mod_exprs, + PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { + using namespace tir; + *has_mult = false; + *has_mod = false; + for (const PrimExpr* ele : eles) { + auto mod_ptr = ele->as(); + auto mult_ptr = ele->as(); + if (mod_ptr) { + *has_mod = true; + mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); + } else if (mult_ptr) { + *has_mult = true; + mult_exprs->emplace_back(*ele); + } else { + *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; + } + } +} + +// Searches for this types of expr: +// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// + c % (k1 * k2 * ... * ki) +// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c +// The search will be performed repeatively until no pattern is found. +// Return: a pair with (false, Expr()) if cannot be optimized. +// a pair with (true, optimized_expr) if can be optimized +inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { + using namespace tir; + // 1. Prepare the lists. + // We store two lists, a list that contain all the elements that match Mul and + // a list that contain all the elements that match Mod. + // The elements in the Mod will be used to match against the elements in Mul. + // The result will then be split and pushed back to these two lists. + PrimExpr simplified_base = analyzer->Simplify(base); + std::vector eles = ExprSplitAddition(simplified_base); + std::list mult_exprs; + std::list > mod_exprs; + PrimExpr no_opt_sum; + bool has_mult; + bool has_mod; + MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); + bool find_opt = false; + std::list >::iterator search_mod_it = mod_exprs.begin(); + // 2. 
Exhaustive Search + while (search_mod_it != mod_exprs.end()) { + std::list::iterator mult_it = mult_exprs.begin(); + bool inner_find_opt = false; + while (mult_it != mult_exprs.end()) { + std::pair ret = + MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); + if (ret.first) { + inner_find_opt = true; + auto temp_mod_it = search_mod_it; + ++search_mod_it; + mod_exprs.erase(temp_mod_it); + mult_exprs.erase(mult_it); + std::vector ret_eles = ExprSplitAddition(ret.second); + MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, + &has_mod); + if (has_mult) { + search_mod_it = mod_exprs.begin(); + } else if (has_mod && search_mod_it == mod_exprs.end()) { + search_mod_it--; + } + break; + } else { + ++mult_it; + } + } + find_opt = find_opt || inner_find_opt; + if (!inner_find_opt) { + ++search_mod_it; + } + } + if (!find_opt) { + return simplified_base; + } + for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; + } + for (std::list >::iterator it = mod_exprs.begin(); + it != mod_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) + : indexmod(it->first, it->second); + } + return no_opt_sum; +} + +// The buffer offset in convention of number of elements of +// original data ignoring number of lanes. +// We also perform optimization to simplify the indexing expression. +inline PrimExpr ElemOffset(const BufferNode* n, Array index) { + PrimExpr base = n->elem_offset; + arith::Analyzer ana; + if (n->strides.size() == 0) { + // Scalar case + if (n->shape.size() == 0 && index.size() == 1) { + auto is_int = index[0].as(); + ICHECK(is_int && is_int->value == 0); + base = base + index[0]; + } else { + ICHECK_EQ(n->shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = MergeMulMod(&ana, offset * n->shape[i] + index[i]); + } + base = base + offset; + } + } + } else { + ICHECK_EQ(n->strides.size(), index.size()); + if (is_zero(base)) { + base = MergeMulMod(&ana, index[0] * n->strides[0]); + } else { + base = MergeMulMod(&ana, base + index[0] * n->strides[0]); + } + for (size_t i = 1; i < index.size(); ++i) { + base = MergeMulMod(&ana, base + index[i] * n->strides[i]); + } + } + return base; +} + +inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataType dtype) { + PrimExpr offset = ElemOffset(n, index); + if (n->dtype.lanes() != 1) { + offset = offset * make_const(offset.dtype(), dtype.lanes()); + } + if (dtype.lanes() != 1) { + return tir::Ramp(offset, make_const(offset.dtype(), 1), dtype.lanes()); + } else { + return offset; + } +} +} + PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 25770b2438e3..e62d736bd80f 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -42,8 +42,6 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" -#include "../ir/buffer_common.h" -#include "arg_binder.h" #include "ir_utils.h" namespace tvm { From a794abdd0a5ed6975f0e9f0e17ea7036ff85523d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 23:08:57 -0700 Subject: [PATCH 44/59] Prune include list --- 
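For context on the routine restored in the previous patch: in its simplest case MergeMulMod looks for flattened-index expressions of the shape (a + c // k) * k + c % k and rewrites them to a * k + c, which is why dropping it from texture flattening requires the analyzer-based SimplifyOffset instead. A quick numeric sanity check of that identity in Python (helper name is made up; floor division assumed, matching FloorDiv/FloorMod semantics):

    def merged(a, c, k):
        unsimplified = (a + c // k) * k + c % k   # form produced by flattening
        simplified = a * k + c                    # form MergeMulMod recovers
        return unsimplified, simplified

    for a, c, k in [(3, 7, 4), (0, 11, 5), (2, 9, 3)]:
        u, s = merged(a, c, k)
        assert u == s
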
src/tir/transforms/texture_flatten.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index e62d736bd80f..85058b9c2112 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -22,27 +22,18 @@ * \brief Flattens texture from multi-dimensional array to 2D buffer access */ -#include -#include #include -#include #include -#include -#include #include #include -#include #include -#include #include #include -#include #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" -#include "ir_utils.h" namespace tvm { namespace tir { From 11fe640529537b4a263b9ae7c6a444002c6ff09c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:40:19 -0700 Subject: [PATCH 45/59] Add more documentation to texture flattening. --- src/tir/transforms/texture_flatten.cc | 37 +++++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 85058b9c2112..0e571b732090 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -19,7 +19,8 @@ /*! * \file texture_flatten.cc - * \brief Flattens texture from multi-dimensional array to 2D buffer access + * \brief Flattens texture storage from multi-dimensional array + * to 2D (width, height) buffer access */ #include @@ -93,15 +94,17 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } - // TODO: need docs - // External buffer + // Set of all external input and output buffers std::unordered_set extern_buf_; - // Storage scope + // Map to track the storage scope of buffer realization and the + // buffer directly. std::unordered_map storage_scope_; // Bound analzer IRVisitorWithAnalyzer* bound_analyzer_; }; +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. 
class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; @@ -123,18 +126,19 @@ class TextureFlattener : public TextureLoweringBase { op = stmt.as(); Stmt body = this->VisitStmt(op->body); + // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); - ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; + ICHECK(vec_length == 4 || vec_length == 1) << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; - struct Shape { + struct ShapeFromRange { const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); - auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); + auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } @@ -202,13 +206,19 @@ class TextureFlattener : public TextureLoweringBase { return args; } - // TODO: Need docs - // Let binding + // Bindings to new texture vars with texture pointer scope std::unordered_map let_binding_; + // Bindings from realized buffers to external buffers when the memory transfer + // to the realized buffer can be cancelled std::unordered_map buffer_binds_; }; - +// Populate bindings from internal buffers to external ones of the same scope +// when it can be proven that the intermediate buffer access is identical +// to the external access. This can allow for cache_read/write cancellation +// when the external buffers are identical to the realized ones. Currently doesn't +// support forwarding external buffers when the realized buffer is conditionally +// loaded due to padding and other possible access modifying expressions. class ExternalBufferForwarding : public TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, @@ -291,16 +301,21 @@ class ExternalBufferForwarding : public TextureLoweringBase { } private: + // List of realize_attrs used to mark the last valid attr stmt to use when rewriting + // the AST to remove any unecessary buffer realization. std::deque realize_attrs_; + // Set of buffers which are identical to external buffers and are copied into. std::unordered_set extern_buffer_copy_; + // Binding from internal identical realized buffer and external buffer. std::unordered_map buffer_map_; + // Active set of loads on external buffers contained in the scope of a buffer + // realize node. std::vector> external_loads_; }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); From 89d832fd1c74a9f362eca3efe0969acc31fa066e Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:48:37 -0700 Subject: [PATCH 46/59] Add TextureFlatten transform to refactored tvm lower API. 
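The convention documented above maps an Nd realize region whose innermost extent is 1 or 4 (the RGBA lane) onto a 2d texture: dimensions before the separator axis fold into the texture height, and the remaining dimensions, excluding the vector lane, fold into the width. A rough Python model of that shape computation, assuming the separator axis is given explicitly and that leading dimensions map to height (the authoritative rules live in runtime/texture.h):

    def texture2d_shape(shape, axis):
        # shape[-1] is the RGBA vector lane and is not flattened.
        assert shape[-1] in (1, 4)
        height = 1
        for extent in shape[:axis]:
            height *= extent
        width = 1
        for extent in shape[axis:-1]:
            width *= extent
        return width, height

    # e.g. a (2, 8, 16, 16, 4) region split after the first two axes
    print(texture2d_shape((2, 8, 16, 16, 4), 2))  # -> (256, 16)

The next hunks register the pass itself and insert it into phase 1 of the lowering pipeline just before StorageFlatten, so texture realizations are rewritten before ordinary buffers are flattened.
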
--- include/tvm/tir/transform.h | 9 +++++++++ src/driver/driver_api.cc | 1 + 2 files changed, 10 insertions(+) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index d1308fe0059e..0ce29fa9da16 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -437,6 +437,15 @@ TVM_DLL Pass LowerMatchBuffer(); */ TVM_DLL Pass FlattenBuffer(); +/* + * \brief Flatten the multi-dimensional read/write + * to two dimensional texture Load/Store and realize + * texture buffer allocations. + * + * \return The Pass + */ +TVM_DLL Pass TextureFlatten(); + /*! * A pass to merge multiple TIR-level dynamic shared memory allocations into one */ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index d6af9936ca40..2759c8503393 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -215,6 +215,7 @@ Array CreatePassList(bool disable_loop_partition) { // PHASE 1 pass_list.push_back(tir::transform::InjectPrefetch()); + pass_list.push_back(tir::transform::TextureFlatten()); pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers)); pass_list.push_back(tir::transform::LowerInitBlock()); pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation()); From 349eb512bf203cd37dd61bf72474ec5f893ba4ab Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:57:17 -0700 Subject: [PATCH 47/59] Apply clang formatting. --- src/target/source/codegen_opencl.cc | 40 +++++++++------------ src/target/source/codegen_opencl.h | 10 +++--- src/tir/ir/buffer.cc | 2 +- src/tir/transforms/lower_tvm_builtin.cc | 31 ++++++++-------- src/tir/transforms/texture_flatten.cc | 47 ++++++++++++++----------- 5 files changed, 63 insertions(+), 67 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5da29ca4643c..379851b0d8f4 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -27,15 +27,15 @@ #include #include "../../runtime/opencl/opencl_module.h" -#include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" +#include "../../runtime/thread_storage_scope.h" #include "../build_common.h" namespace tvm { namespace codegen { class InferTextureAccess : public StmtExprVisitor { -public: + public: static constexpr const uint8_t read_access = 1; static constexpr const uint8_t write_access = 2; @@ -46,11 +46,9 @@ class InferTextureAccess : public StmtExprVisitor { for (auto& texture : var_access_map_) { if (texture.second == read_access) { storage_scope_qualifiers.insert({texture.first, "texture_read"}); - } - else if (texture.second == write_access) { + } else if (texture.second == write_access) { storage_scope_qualifiers.insert({texture.first, "texture_write"}); - } - else if (texture.second == (read_access | write_access)) { + } else if (texture.second == (read_access | write_access)) { storage_scope_qualifiers.insert({texture.first, ""}); } } @@ -67,11 +65,10 @@ class InferTextureAccess : public StmtExprVisitor { StmtExprVisitor::VisitExpr_(op); } -private: + private: std::unordered_map var_access_map_; }; - CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { @@ -83,8 +80,7 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { // Storage scope qualifiers for textures are inferred // and set prior to function codegen. 
continue; - } - else if (arg.dtype().is_handle()) { + } else if (arg.dtype().is_handle()) { alloc_storage_scope_[arg.get()] = "global"; } } @@ -313,8 +309,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { // intermediate local unless the buffer allocation is a // single element selected from the texture read. auto it = allocation_size_.find(op->buffer_var.get()); - if (it != allocation_size_.end() && it->second == 1) - { + if (it != allocation_size_.end() && it->second == 1) { need_texture_ssa_ = true; } } @@ -334,7 +329,8 @@ void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { } void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { - allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); + allocation_size_.insert( + {op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); } @@ -360,8 +356,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { DataType buffer_type = ptr_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; - } - else if (buffer_type.is_float()) { + } else if (buffer_type.is_float()) { os << "write_imagef("; } else { LOG(FATAL) << "Unsupported type: " << buffer_type @@ -380,8 +375,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { std::stringstream ss; if (op->dtype.is_float16()) { ss << "read_imageh("; - } - else if (op->dtype.is_float()) { + } else if (op->dtype.is_float()) { ss << "read_imagef("; } else { LOG(FATAL) << "Unsupported type: " << op->dtype @@ -397,11 +391,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ss << "))"; // Only use local SSA if texture is not already being stored - if (need_texture_ssa_) - { + if (need_texture_ssa_) { std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); - if (op->args.back().as()) - { + if (op->args.back().as()) { os << rhs; } else { os << "(("; @@ -450,9 +442,9 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N } } -void CodeGenOpenCL::SetTextureScope(const std::unordered_map& scope) { // NOLINT(*) - for (auto& texture : scope) - { +void CodeGenOpenCL::SetTextureScope( + const std::unordered_map& scope) { // NOLINT(*) + for (auto& texture : scope) { alloc_storage_scope_.insert(texture); } } diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index a456fdd94f5f..4c57a84ebeaf 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -51,11 +51,10 @@ class CodeGenOpenCL final : public CodeGenC { const std::string& value) final; // NOLINT(*) // the address of load/store void PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, - std::ostream& os); // NOLINT(*) - void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) - std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) - void SetTextureScope(const std::unordered_map&); // NOLINT(*) - + std::ostream& os); // NOLINT(*) + void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) + std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + void SetTextureScope(const std::unordered_map&); // NOLINT(*) // overload visitor void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) @@ -65,7 +64,6 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* 
op) final; // NOLINT(*) - private: // whether enable fp16 and fp64 extension bool enable_fp16_{false}; diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 6a102339bcea..5293c7f1fee5 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -286,7 +286,7 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } -} +} // namespace PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index daa868668c47..83f35b150b24 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -356,28 +356,27 @@ class BuiltinLower : public StmtExprMutator { ICHECK(device_id_.defined()) << "Unknown device id in current IR"; Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); - Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), - throw_last_error), - let->body}); - DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + Stmt body = SeqStmt( + {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), + let->body}); + DataType dtype = + let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), - cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), - cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), - IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}); + Call call_packed = + Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}); Stmt alloca = LetStmt(let->var, call_packed, body); - Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeTexture"), - cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + Call free_op = + Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0e571b732090..0ca4668a57cf 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -33,20 +33,20 @@ #include #include "../../arith/ir_visitor_with_analyzer.h" -#include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" +#include "../../runtime/thread_storage_scope.h" namespace tvm { namespace tir { -using runtime::IsTextureStorage; -using runtime::DefaultTextureLayoutSeparator; using runtime::ApplyTexture2DFlattening; - +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; class TextureLoweringBase : public StmtExprMutator { public: - explicit 
TextureLoweringBase(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : bound_analyzer_{bound_analyzer} { + explicit TextureLoweringBase(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { for (auto kv : extern_buffer_map) { extern_buf_.insert(kv.second); } @@ -81,7 +81,6 @@ class TextureLoweringBase : public StmtExprMutator { } protected: - std::string GetStorageScope(const Buffer& buffer) { std::string storage_scope; auto it = storage_scope_.find(buffer.get()); @@ -108,10 +107,14 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; - explicit TextureFlattener(const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_, - IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer), buffer_binds_(extern_buffer_binds_) {;} + explicit TextureFlattener( + const Map& extern_buffer_map, + const std::unordered_map& extern_buffer_binds_, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer), + buffer_binds_(extern_buffer_binds_) { + ; + } Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -119,7 +122,8 @@ class TextureFlattener : public TextureLoweringBase { } std::string storage_scope = GetStorageScope(op->buffer); - Var buffer_var(op->buffer->data->name_hint, PointerType(PrimType(op->buffer->dtype), String(storage_scope))); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), String(storage_scope))); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -131,14 +135,16 @@ class TextureFlattener : public TextureLoweringBase { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); - ICHECK(vec_length == 4 || vec_length == 1) << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; + ICHECK(vec_length == 4 || vec_length == 1) + << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; struct ShapeFromRange { const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); - auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } @@ -180,8 +186,7 @@ class TextureFlattener : public TextureLoweringBase { } protected: - - template + template Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { Array args; if (let_binding_.count(op->buffer->data)) { @@ -190,7 +195,7 @@ class TextureFlattener : public TextureLoweringBase { args.push_back(buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { + for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); @@ -223,7 +228,9 @@ class ExternalBufferForwarding : public 
TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) {;} + : TextureLoweringBase(extern_buffer_map, bound_analyzer) { + ; + } Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); @@ -313,14 +320,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { std::vector> external_loads_; }; - PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), + &bound_analyzer)(std::move(fptr->body)); return func; } From da3e146324e3d3c23dd5a8bcf6a74753b9b8a808 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:57:57 -0700 Subject: [PATCH 48/59] Blacken python APIs. --- python/tvm/tir/transform/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 4cdf7d47856e..489331b049d4 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,8 +94,9 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore + def TextureFlatten(): - """Flatten the multi-dimensional read/write to 1D. + """Flatten the multi-dimensional read/write to 2D. Parameters From b43e0e780094b578141815bcd007fb177983899d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sun, 20 Jun 2021 21:02:03 -0700 Subject: [PATCH 49/59] Apply cpplint changes. 
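With the Python binding updated in the previous patch, the pass can be applied to an IRModule like any other TIR transform, in addition to running automatically during lowering. A minimal usage sketch, assuming a module `mod` produced from a schedule that stages buffers in "global.texture" scope:

    import tvm
    from tvm import tir

    seq = tvm.transform.Sequential(
        [
            tir.transform.TextureFlatten(),
            tir.transform.StorageFlatten(64),
        ]
    )
    # mod = seq(mod)  # `mod` is assumed to be an IRModule with texture-scoped buffers
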
--- src/target/source/codegen_opencl.cc | 2 +- src/target/source/codegen_opencl.h | 1 + src/tir/transforms/texture_flatten.cc | 8 ++------ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 379851b0d8f4..9137b69e4bb5 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -39,7 +39,7 @@ class InferTextureAccess : public StmtExprVisitor { static constexpr const uint8_t read_access = 1; static constexpr const uint8_t write_access = 2; - explicit InferTextureAccess() {} + InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 4c57a84ebeaf..722db2b7f7b5 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -27,6 +27,7 @@ #include #include +#include #include "codegen_c.h" diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0ca4668a57cf..53266681e1db 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -112,9 +112,7 @@ class TextureFlattener : public TextureLoweringBase { const std::unordered_map& extern_buffer_binds_, IRVisitorWithAnalyzer* bound_analyzer) : TextureLoweringBase(extern_buffer_map, bound_analyzer), - buffer_binds_(extern_buffer_binds_) { - ; - } + buffer_binds_(extern_buffer_binds_) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -228,9 +226,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) { - ; - } + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); From 61387263ba227a63e6299fde616ee01f40beba69 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 19 Jul 2021 15:51:46 -0700 Subject: [PATCH 50/59] Attempt to extract storage scope from pointer scope. 
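Some background on the fallback added here: since TextureType was dropped, the storage scope travels on the buffer's data Var as part of its PointerType annotation, so when Buffer::scope is empty or plain "global" the scope can still be recovered from the pointer type. A small, illustrative Python look at where that annotation lives (constructor arguments are assumptions based on the C++ side):

    import tvm
    from tvm import tir

    ptr_ty = tvm.ir.PointerType(tvm.ir.PrimType("float32"), "global.texture")
    data = tir.Var("A_ptr", ptr_ty)
    print(data.type_annotation.storage_scope)  # -> global.texture
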
--- src/tir/ir/buffer.cc | 2 -- src/tir/transforms/lower_tvm_builtin.cc | 1 - src/tir/transforms/texture_flatten.cc | 5 +++++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 5293c7f1fee5..de2d92e0f3ab 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -54,7 +54,6 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } -namespace { // Split the given expression w.r.t the add operator inline std::vector ExprSplitAddition(const PrimExpr& expr) { using namespace tir; @@ -286,7 +285,6 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } -} // namespace PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 83f35b150b24..f5a553aa0598 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -193,7 +193,6 @@ class BuiltinLower : public StmtExprMutator { return StmtExprMutator::VisitExpr_(op); } } - // call shape PrimExpr MakeShape(const CallNode* op) { // if args.size() == 0, it represents a scalar shape () diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 53266681e1db..738bfd3de2c5 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -89,6 +89,11 @@ class TextureLoweringBase : public StmtExprMutator { storage_scope = it->second; } else { storage_scope = buffer->scope; + if (storage_scope == "global" || storage_scope == "") { + if (auto* ptr = buffer->data->type_annotation.as()) { + storage_scope = ptr->storage_scope; + } + } } return storage_scope; } From bd084c06a8afddb972898d584da99981a869b9c0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:00 -0700 Subject: [PATCH 51/59] Remove ExternalBufferForwarding (cache_read cancellation) for now. 
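The forwarding being removed here targeted schedules where an external buffer is staged through texture memory by a cache_read whose body is a pure copy, making the intermediate buffer provably identical to its source. A sketch of the kind of schedule that creates such a copy (shapes and stage names are illustrative; the cancellation of that copy is what this patch drops for now):

    from tvm import te

    A = te.placeholder((32, 64, 4), name="A", dtype="float32")
    B = te.compute(A.shape, lambda i, j, k: A[i, j, k] * 2.0, name="B")
    s = te.create_schedule(B.op)
    # Stage the input through texture memory; the A -> At copy is the one the
    # removed pass tried to elide when it was an identical, unconditional copy.
    At = s.cache_read(A, "global.texture", [B])
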
--- src/tir/transforms/texture_flatten.cc | 156 ++------------------------ 1 file changed, 7 insertions(+), 149 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 738bfd3de2c5..2a7394d01396 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -52,21 +52,6 @@ class TextureLoweringBase : public StmtExprMutator { } } - virtual Stmt VisitStmt_(const AttrStmtNode* op) { - if (op->attr_key == attr::realize_scope) { - std::string realize_scope = op->value.as()->value; - // If realize_scope for external buffer is unset, infer from buffer scope - if (realize_scope == "" && op->body->IsInstance()) { - const auto* realize = Downcast(op->body).get(); - if (extern_buf_.count(realize->buffer)) { - realize_scope = realize->buffer->scope; - } - } - storage_scope_[op->node.get()] = realize_scope; - } - return StmtExprMutator::VisitStmt_(op); - } - inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); @@ -82,27 +67,13 @@ class TextureLoweringBase : public StmtExprMutator { protected: std::string GetStorageScope(const Buffer& buffer) { - std::string storage_scope; - auto it = storage_scope_.find(buffer.get()); - // If buffer has a realize_scope attr return it - if (it != storage_scope_.end()) { - storage_scope = it->second; - } else { - storage_scope = buffer->scope; - if (storage_scope == "global" || storage_scope == "") { - if (auto* ptr = buffer->data->type_annotation.as()) { - storage_scope = ptr->storage_scope; - } - } - } - return storage_scope; + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; } // Set of all external input and output buffers std::unordered_set extern_buf_; - // Map to track the storage scope of buffer realization and the - // buffer directly. 
- std::unordered_map storage_scope_; // Bound analzer IRVisitorWithAnalyzer* bound_analyzer_; }; @@ -114,10 +85,8 @@ class TextureFlattener : public TextureLoweringBase { using StmtExprMutator::VisitStmt_; explicit TextureFlattener( const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer), - buffer_binds_(extern_buffer_binds_) {} + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -172,15 +141,10 @@ class TextureFlattener : public TextureLoweringBase { PrimExpr VisitExpr_(const BufferLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - // Replace with identitcal external buffer if one exists - auto buffer = op->buffer; - if (buffer_binds_.count(op->buffer)) { - buffer = buffer_binds_[op->buffer]; - } // Lower to two dimensional access - std::string storage_scope = GetStorageScope(buffer); + std::string storage_scope = GetStorageScope(op->buffer); if (IsTextureStorage(storage_scope)) { - Array args = GetTextureAccessArgs(op, buffer); + Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); } @@ -216,119 +180,13 @@ class TextureFlattener : public TextureLoweringBase { // Bindings to new texture vars with texture pointer scope std::unordered_map let_binding_; - // Bindings from realized buffers to external buffers when the memory transfer - // to the realized buffer can be cancelled - std::unordered_map buffer_binds_; -}; - -// Populate bindings from internal buffers to external ones of the same scope -// when it can be proven that the intermediate buffer access is identical -// to the external access. This can allow for cache_read/write cancellation -// when the external buffers are identical to the realized ones. Currently doesn't -// support forwarding external buffers when the realized buffer is conditionally -// loaded due to padding and other possible access modifying expressions. -class ExternalBufferForwarding : public TextureLoweringBase { - public: - explicit ExternalBufferForwarding(const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} - - Stmt VisitStmt_(const AttrStmtNode* op) final { - Stmt stmt = TextureLoweringBase::VisitStmt_(op); - if (op->attr_key == attr::realize_scope) { - if (op->body->IsInstance()) { - const auto* realize = Downcast(op->body).get(); - std::string realize_scope = GetStorageScope(realize->buffer); - if (IsTextureStorage(realize_scope) && extern_buffer_copy_.count(realize->buffer)) { - return realize_attrs_.back(); - } else { - if (realize_attrs_.size()) { - realize_attrs_.pop_back(); - } - realize_attrs_.push_back(stmt); - } - return stmt; - } - } - - return stmt; - } - - Stmt VisitStmt_(const BufferStoreNode* op) final { - ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; - if (auto* call_node = op->value.as()) { - // Path to supporting external cache_read canceling when padding has induced - // a conditional load into the cache_read buffer. 
We may be able to elide the - // conditional completely due to hardware support for returning 0 when OOB - if (call_node->op.same_as(builtin::if_then_else())) { - external_loads_.emplace_back(); - } - } - Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); - - auto check_identity = [this](const BufferStoreNode* store, const BufferLoad& load) { - if (extern_buf_.count(load->buffer)) { - // If the buffer to load and the buffer to store to are both texture - // check for identical access - if (IsTextureStorage(GetStorageScope(load->buffer)) && - IsTextureStorage(GetStorageScope(store->buffer))) { - auto store_index = SimplifyOffset(store->buffer->shape, store->indices); - auto load_index = SimplifyOffset(load->buffer->shape, load->indices); - if (arith::Analyzer().CanProve(store_index == load_index)) { - extern_buffer_copy_.insert(store->buffer); - buffer_map_.insert({store->buffer, load->buffer}); - } - } - } - }; - - if (auto load_node = op->value.as()) { - check_identity(op, GetRef(load_node)); - } else if (external_loads_.size()) { - // Stored value is not a load, check for external loads collected - // when visiting the store node's value, e.g. from if_then_else - for (auto& expr : external_loads_.back()) { - check_identity(op, Downcast(expr)); - } - external_loads_.pop_back(); - } - return stmt; - } - - PrimExpr VisitExpr_(const BufferLoadNode* op) final { - PrimExpr expr = StmtExprMutator::VisitExpr_(op); - if (external_loads_.size() && extern_buf_.count(op->buffer)) { - external_loads_.back().push_back(expr); - } - return expr; - } - - const std::unordered_map& GetForwardedBuffers() { - return buffer_map_; - } - - private: - // List of realize_attrs used to mark the last valid attr stmt to use when rewriting - // the AST to remove any unecessary buffer realization. - std::deque realize_attrs_; - // Set of buffers which are identical to external buffers and are copied into. - std::unordered_set extern_buffer_copy_; - // Binding from internal identical realized buffer and external buffer. - std::unordered_map buffer_map_; - // Active set of loads on external buffers contained in the scope of a buffer - // realize node. - std::vector> external_loads_; }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); - ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); - fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), - &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); return func; } From d2f8bda8c44beceec814a91d498b65ed10cbbbbc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:24 -0700 Subject: [PATCH 52/59] Apply MyPy. 
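For reference, the forwarding logic removed in the previous patch decided whether a store into a realized buffer was an identical copy of an external load by flattening both index vectors with SimplifyOffset and asking the arithmetic analyzer to prove the offsets equal. A standalone sketch of that proof step with TVM's Python analyzer (variables and the 2-d shape are made up for illustration):

    import tvm
    from tvm import tir, arith

    i = tir.Var("i", "int32")
    j = tir.Var("j", "int32")
    ana = arith.Analyzer()
    # Flattened offsets of store[i, j] and load[i, j] into a (16, 32) buffer.
    store_offset = i * 32 + j
    load_offset = i * 32 + j
    print(ana.simplify(store_offset - load_offset))  # 0 => provably identical access
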
--- python/tvm/tir/transform/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 489331b049d4..732bf0f1ca11 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -107,7 +107,7 @@ def TextureFlatten(): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.TextureFlatten() + return _ffi_api.TextureFlatten() # type: ignore def InjectCopyIntrin(pragma_key: str, fintrin): From b0ea2834f9e353f52cce03835583522a978c1bdf Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:44 -0700 Subject: [PATCH 53/59] Clang format --- src/tir/transforms/texture_flatten.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 2a7394d01396..d74202a3ab79 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -83,9 +83,8 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; - explicit TextureFlattener( - const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) + explicit TextureFlattener(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { From 11027316774fe602c528a245423203c0af63f80d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 22 Jul 2021 09:33:39 -0700 Subject: [PATCH 54/59] Only visit RealizeBuffer body for texture storage. --- src/tir/transforms/texture_flatten.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d74202a3ab79..7dc800737944 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -99,11 +99,10 @@ class TextureFlattener : public TextureLoweringBase { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - Stmt body = this->VisitStmt(op->body); // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { - body = this->VisitStmt(op->body); + Stmt body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) From 46ae5abd38cfdec18f8a439ba0858c35329a48ad Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 22 Jul 2021 21:14:12 -0700 Subject: [PATCH 55/59] Fix bad merge. --- src/tir/ir/buffer.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index de2d92e0f3ab..335ff19dd775 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -183,7 +183,12 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { // a list that contain all the elements that match Mod. // The elements in the Mod will be used to match against the elements in Mul. // The result will then be split and pushed back to these two lists. 
- PrimExpr simplified_base = analyzer->Simplify(base); + PrimExpr simplified_base = base; + arith::PVar x, y; + if ((floordiv(x, y) * y + floormod(x, y)).Match(simplified_base)) { + simplified_base = x.Eval(); + } + simplified_base = analyzer->Simplify(simplified_base); std::vector eles = ExprSplitAddition(simplified_base); std::list mult_exprs; std::list > mod_exprs; From 9fa362d1c7f9190a998cf86420404962c229fcc5 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 26 Jul 2021 14:31:29 -0700 Subject: [PATCH 56/59] Utilize OpenCL preprocessor to switch between sampler-less and codegen provided sampler for texture reads depending on whether the opencl runtime is 2.0 compliant. --- src/target/source/codegen_opencl.cc | 48 ++++++++++++++++++++++++++--- src/target/source/codegen_opencl.h | 7 +++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 9137b69e4bb5..8d760a07e032 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -69,7 +69,10 @@ class InferTextureAccess : public StmtExprVisitor { std::unordered_map var_access_map_; }; -CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } +CodeGenOpenCL::CodeGenOpenCL() { + // Set OpenCL specific restrict keyword + restrict_keyword_ = "restrict"; +} void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); @@ -117,6 +120,40 @@ std::string CodeGenOpenCL::Finish() { decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n"; } + + // Enable OpenCL 1.2 sampler-less texture reads, but utilize + // provided sampler in OpenCL 2.0. + if (enable_compliant_texture_reads_) { + // TODO(csullivan, lunderberg): Extend device attribute querying to support remote devices + // generically through the device API such that a target can be created from a specific device's + // attributes and utilized during codegen. Potential generlization of #8127 (c02cafb) for remote + // devices. + // + // E.g. Only provide an image sampler when the local or remote device supports OpenCL 2.0, + // see below for context. + // + // For backwards compatibility with OpenCL 1.2, sampler-less read_image calls are used. + // By default in sampler-less read_image calls OpenCL defaults to + // sampler_ = "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; + // See section 6.12.14.3 Built-in Image Sampler-less Read Functions in the OpenCL 1.2 + // specification. For OpenCL 2.0 it can be preferable to use, + // sampler_ = "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; + // For now we rely on OpenCL preprocessor directives to utilize the correct behavior + // depending on the OpenCL version detected at OpenCL compile time. 
+ decl_stream << "#ifdef __OPENCL_VERSION__\n" + << "#if __OPENCL_VERSION__ == CL_VERSION_2_0\n" + << "#define READ_IMAGEH(image, sampler, coord) " + << "read_imageh(image, sampler, coord)\n" + << "#define READ_IMAGEF(image, sampler, coord) " + << "read_imagef(image, sampler, coord)\n" + << "#else\n" + << "#define READ_IMAGEH(image, sampler, coord) " + << "read_imageh(image, coord)\n" + << "#define READ_IMAGEF(image, sampler, coord) " + << "read_imagef(image, coord)\n" + << "#endif\n" + << "#endif\n\n"; + } return CodeGenC::Finish(); } @@ -372,11 +409,12 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::texture2d_load())) { + enable_compliant_texture_reads_ = true; std::stringstream ss; if (op->dtype.is_float16()) { - ss << "read_imageh("; + ss << "READ_IMAGEH("; } else if (op->dtype.is_float()) { - ss << "read_imagef("; + ss << "READ_IMAGEF("; } else { LOG(FATAL) << "Unsupported type: " << op->dtype << ", currently only float and half are supported for image2d OpenCL codegen."; @@ -384,11 +422,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[0], ss); ss << ", "; ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; - ss << "(int2)("; + ss << "((int2)("; this->PrintExpr(op->args[1], ss); ss << ", "; this->PrintExpr(op->args[2], ss); - ss << "))"; + ss << ")))"; // Only use local SSA if texture is not already being stored if (need_texture_ssa_) { diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 722db2b7f7b5..a8c293c03056 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -71,7 +71,14 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. bool enable_atomics_{false}; + // Whether to enable sampler or sampler-less texture reads, + // where the choice depends on the OpenCL version used. + bool enable_compliant_texture_reads_{false}; + // Key to disable use of texture SSA in certain scenarios. For example, + // when loaded value is stored directly to a user declared l-value buffer bool need_texture_ssa_{true}; + // Mapping from buffer to allocation size. + // Useful to track when a scalar store of a vectorized texture load is required. std::unordered_map allocation_size_; }; From 213492ce6994b5a341afaef6fe78c09b99903f20 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 5 Aug 2021 16:10:45 -0700 Subject: [PATCH 57/59] Add texture codegen test example. --- .../test_target_texture_codegen_opencl.py | 1767 +++++++++++++++++ 1 file changed, 1767 insertions(+) create mode 100644 tests/python/unittest/test_target_texture_codegen_opencl.py diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py new file mode 100644 index 000000000000..b155d56f1346 --- /dev/null +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -0,0 +1,1767 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys + +import numpy as np +import pytest + +import tvm +from tvm import autotvm +from tvm import te +from tvm.topi import testing +from tvm.topi.utils import get_const_tuple, simplify +from tvm.topi import nn + + +def compute(shape): + X = te.placeholder(shape, name="X", dtype="float32") + Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y") + return X, Y + +def schedule(X, Y): + s = te.create_schedule(Y.op) + #Xt = s.cache_read(X, "texture", [Y]) + #Xt = s.cache_read(X, "global", [Y]) + Xt = s.cache_read(X, "global.texture", [Y]) + + # copy to texture stage + x, y, c = s[Xt].op.axis + s[Xt].bind(x, te.thread_axis("blockIdx.x")) + s[Xt].bind(y, te.thread_axis("threadIdx.x")) + s[Xt].vectorize(c) + + # the compute stage + x, y, c = s[Y].op.axis + xo, yo, xi, yi = s[Y].tile(x, y, 4, 4) + s[Y].bind(xo, te.thread_axis("blockIdx.x")) + s[Y].bind(yo, te.thread_axis("threadIdx.x")) + s[Y].vectorize(c) + return s + +def compute5d(shape): + X = te.placeholder(shape, name="X", dtype="float32") + Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y") + return X, Y + +def schedule5d(X, Y): + s = te.create_schedule(Y.op) + Xt = s.cache_read(X, "global.texture", [Y]) + + # copy to texture stage + a, b, c, d, e = s[Xt].op.axis + abc = s[Xt].fuse(a, b, c) + s[Xt].bind(abc, te.thread_axis("blockIdx.x")) + s[Xt].bind(d, te.thread_axis("threadIdx.x")) + s[Xt].vectorize(e) + + # the compute stage + a, b, c, d, e = s[Y].op.axis + abc = s[Y].fuse(a, b, c) + xo, yo, xi, yi = s[Y].tile(abc, d, 4, 4) + s[Y].bind(xo, te.thread_axis("blockIdx.x")) + s[Y].bind(yo, te.thread_axis("threadIdx.x")) + s[Y].vectorize(e) + return s + +def compute_matmul(shape): + A = te.placeholder(shape, name="A", dtype="float32") + B = te.placeholder(shape, name="B", dtype="float32") + k = te.reduce_axis((0, shape[1]), name="k") + C = te.compute( + (shape[0]*shape[2], shape[0]*shape[2]), + lambda i, j: te.sum( + A[i//shape[2], k, i%shape[2]].astype("float32") * B[j//shape[2], k, j%shape[2]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): + _io, _k, _ii = s[stage].op.axis + s[stage].vectorize(_ii) + s[stage].bind(_io, bx) + s[stage].bind(_k, tx) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _x, _y = s[Cl].op.axis + s[Cl].reorder(_k, _x, _y) + s[Cl].unroll(_x) + s[Cl].vectorize(_y) + + if local: + s[Al].compute_at(s[Cl], _k) + 
s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _k) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + + +def compute_matmul_inner(shape): + A = te.placeholder(shape, name="A", dtype="float32") + B = te.placeholder(shape, name="B", dtype="float32") + k = te.reduce_axis((0, shape[1]*shape[2]), name="k") + # (M, K) x (N, K) + # (32, 256) x (32, 256) + # (32, 64, 4) x (32, 64, 4) + C = te.compute( + (shape[0], shape[0]), + lambda i, j: te.sum( + A[i, k//shape[2], k%shape[2]].astype("float32") * B[j, k//shape[2], k%shape[2]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul_inner(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): + _i, _ko, _ki = s[stage].op.axis + s[stage].vectorize(_ki) + s[stage].bind(_i, bx) + s[stage].bind(_ko, tx) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _x, _y = s[Cl].op.axis + s[Cl].reorder(_x, _y, _k) + s[Cl].unroll(_x) + # TODO(csullivan): consider whether the below error is worth resolving + # s[Cl].vectorize(_y) # error + + if local: + s[Al].compute_at(s[Cl], _x) + s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _x) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + +def compute_matmul_vector_accumulator(shapeA, shapeB): + # A x B + # (K/4, M, K%4) x (K, N/4, N%4) = (M, N) + # (32, 64, 4) x (128, 16, 4) = (64, 64) + A = te.placeholder(shapeA, name="A", dtype="float32") + B = te.placeholder(shapeB, name="B", dtype="float32") + k = te.reduce_axis((0, shapeB[0]), name="k") + C = te.compute( + (shapeA[1], shapeB[1]*shapeB[2]), + lambda i, j: te.sum( + A[k//shapeA[-1], i, k%shapeA[-1]].astype("float32") * B[k, j//shapeB[-1], j%shapeB[-1]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul_vector_accumulator(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + _y, _x, _v = s[stage].op.axis + # TODO(csullivan): removing this vectorize results in numerical errors, autovectorize + s[stage].vectorize(_v) + s[stage].bind(_y, te.thread_axis("blockIdx.x")) + s[stage].bind(_x, te.thread_axis("threadIdx.x")) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _a, _b = s[Cl].op.axis + _ko, _ki = s[Cl].split(_k, factor=4) + s[Cl].reorder(_ko, _a, _ki, _b) + s[Cl].unroll(_ki) + s[Cl].unroll(_a) + s[Cl].vectorize(_b) + + if local: + s[Al].compute_at(s[Cl], _a) + _aa, _ka, _ba = s[Al].op.axis 
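+        # _ba spans the vector lane (last axis) of the cached texture tile;
+        # keeping it vectorized lets the local copy read whole texels.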
+ # TODO(csullivan)[BEFORE PR]: removing this vectorize command causes a crash. This needs to be autovectorized. + s[Al].vectorize(_ba) + s[Bl].compute_at(s[Cl], _ko) + _ab, _kb, _bb = s[Bl].op.axis + s[Bl].vectorize(_bb) + s[Bl].unroll(_ab) + + return s + +def schedule_matmul_vector_accumulator_autotvm(A, B, C): + s = te.create_schedule(C.op) + cfg = autotvm.get_config() + + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + _y, _x, _v = s[stage].op.axis + s[stage].vectorize(_v) + s[stage].bind(_y, te.thread_axis("blockIdx.x")) + s[stage].bind(_x, te.thread_axis("threadIdx.x")) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _a, _b = s[Cl].op.axis + _ko, _ki = s[Cl].split(_k, factor=4) + + s[Cl].reorder(_ko, _a, _ki, _b) + cfg.define_knob("unroll", [0, 1]) + if cfg["unroll"] == 1: + s[Cl].unroll(_ki) + s[Cl].unroll(_a) + s[Cl].vectorize(_b) + + s[Al].compute_at(s[Cl], _a) + _aa, _ka, _ba = s[Al].op.axis + s[Al].vectorize(_ba) + s[Bl].compute_at(s[Cl], _ko) + _ab, _kb, _bb = s[Bl].op.axis + s[Bl].vectorize(_bb) + s[Bl].unroll(_ab) + + + return s + +def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): + # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + c = te.reduce_axis((0, input_shape[1]), name="C") + c4 = te.reduce_axis((0, input_shape[-1]), name="c4") + kh = te.reduce_axis((0, filter_shape[0]), name="kh") + kw = te.reduce_axis((0, filter_shape[1]), name="kw") + conv = te.compute( + (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]), + lambda n, ko, i, j, ki: te.sum( + data[n, c, i, j, c4].astype("float32") * filt[kh, kw, c*input_shape[-1] + c4, ko, ki].astype("float32"), axis=[kh, kw, c, c4] + ), + #name="Compute_conv2d_1x1_NCHWc_RSCKk", + name = "conv2d_1x1" + ) + return data, filt, conv + +def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv): + # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + # outputs: + s = te.create_schedule(conv.op) + A, B, C = data, filt, conv + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) + copy_to_texture(Bt) + + _n, _ko, _h, _w, _ki = s[C].op.axis + s[C].vectorize(_ki) + s[C].bind(_n, te.thread_axis("blockIdx.x")) + s[C].bind(_ko, te.thread_axis("threadIdx.x")) + + s[Cl].compute_at(s[C], _w) + _nl, _kol, _hl, _wl, _kil = s[Cl].op.axis + _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis + _clo, _cli = s[Cl].split(_cl, factor=4) + s[Cl].reorder(_clo, _cli, _cl4, _kil) + s[Cl].unroll(_cli) + s[Cl].unroll(_cl4) + s[Cl].vectorize(_kil) + + 
s[Al].compute_at(s[Cl], _cli) + s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _kwl) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + + +def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): + # input_shape = [W, C, H, N, c] -> [W, C, H*N, c] + # filter_shape = [C, R, S, K, k] -> [C, R*S*K, k] + # output_shape: [WK, HN, k] -> [W, K, H, N, k] + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + + packed_data = te.compute( + (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]), + lambda i, j, k, l: data[i, j, k//input_shape[3], k%input_shape[3], l], + name = "packed_data" + ) + + # Logical transformation of Nd -> 3d tensor + # CRSKk -> C|RSK|k + # r = rsk // SK + # sk = rsk % SK + # s = sk // K == (rsk % SK) // K == (rsk // K) % S + # k = sk % K == (rsk % SK) % K == rsk % K + packed_filter = te.compute( + (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]), + lambda i, j, k: filt[i, j//(filter_shape[3] * filter_shape[2]), (j//filter_shape[3])%filter_shape[2], j%filter_shape[3], k], + name = "packed_filter" + ) + + c = te.reduce_axis((0, input_shape[1]), name="C") + c4 = te.reduce_axis((0, input_shape[-1]), name="c4") + r = te.reduce_axis((0, filter_shape[1]), name="r") + s = te.reduce_axis((0, filter_shape[2]), name="s") + + conv = te.compute( + (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]), + lambda w, ko, h, n, ki: te.sum( + packed_data[w, c, h * input_shape[3] + n, c4].astype("float32") + * + packed_filter[c*input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki].astype("float32"), axis=[r, s, c, c4] + ), + name = "conv2d_1x1" + ) + return data, filt, packed_data, packed_filter, conv + +def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv): + # data: [W, C, H*N, c] + # filter: [C, R*S*K, k] + # output: [W, K, H, N, k] + + # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] + # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + + # data: (56, 128//4, 56*1, 4) = (56, 32, 56, 4) + # filt: (128, 1*1*128//4, 4) = (128, 32, 4) + # conv: (56, 32, 56, 1, 4) + + s = te.create_schedule(conv.op) + cfg = autotvm.get_config() + + s[packed_data].compute_inline() + s[packed_filter].compute_inline() + A, B, C = packed_data, packed_filter, conv + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) + copy_to_texture(Bt) + + _w, _ko, _h, _n, _ki = s[C].op.axis + kernel_scope, _n = s[C].split(_n, nparts=1) + + cfg.define_split("tile_f", _ko, num_outputs=4) + cfg.define_split("tile_w", _w, num_outputs=4) + cfg.define_split("tile_h", _h, num_outputs=4) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + + bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko) + bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w) + bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h) + s[C].reorder(bh, _n, vh, th, hi) + bhn = s[C].fuse(bh, _n) + + s[C].bind(bk, te.thread_axis("blockIdx.z")) + s[C].bind(bhn, 
te.thread_axis("blockIdx.y")) + s[C].bind(bw, te.thread_axis("blockIdx.x")) + s[C].bind(vk, te.thread_axis("vthread")) + s[C].bind(vh, te.thread_axis("vthread")) + s[C].bind(vw, te.thread_axis("vthread")) + s[C].bind(tk, te.thread_axis("threadIdx.z")) + s[C].bind(th, te.thread_axis("threadIdx.y")) + s[C].bind(tw, te.thread_axis("threadIdx.x")) + s[C].reorder(bw, bk, bhn, vw, vk, vh, tw, tk, th, ki, hi, wi, _ki) + s[C].vectorize(_ki) + + # TODO(csullivan): Try uneven workgroup split + # _wo, _wi = s[C].split(_w, factor=4) + # #_hno, _hni = s[C].split(_hn, factor=8) + # #s[C].reorder(_wo, _wi, _ko, _hno, _hni, _ki) + # s[C].reorder(_wo, _ko, _hn, _ki, _wi) + # s[C].unroll(_wi) + + # # mace: + # # const int out_ch_blk = get_global_id(0); + # # const int out_w_blk = get_global_id(1); + # # const int out_hb = get_global_id(2); + + # bx = te.thread_axis("blockIdx.x") + # by = te.thread_axis("blockIdx.y") + # bz = te.thread_axis("blockIdx.z") + # s[C].bind(_ko, bx) + # s[C].bind(_wo, by) + # s[C].bind(_hn, bz) + + #s[Cl].compute_at(s[C], _hn) + s[Cl].compute_at(s[C], th) + + _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis + _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis + + cfg.define_split("tile_c", _cl, num_outputs=2) + cfg.define_split("tile_kh", _khl, num_outputs=2) + cfg.define_split("tile_kw", _kwl, num_outputs=2) + + + + _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl) + _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl) + _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl) + #s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl) + #s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) + # s[Cl].reorder(_cl, _cl4, _kil, _wl) + s[Cl].unroll(_cl4) + s[Cl].unroll(_wl) + s[Cl].vectorize(_kil) + + + _wla, _cla, _hnla, _cl4a = s[Al].op.axis + s[Al].compute_at(s[Cl], _cli) + s[Al].vectorize(_cl4a) + s[Al].unroll(_wla) + + _clb, _rskolb, _kilb = s[Bl].op.axis + s[Bl].compute_at(s[Cl], _cli) + s[Bl].vectorize(_kilb) + s[Bl].unroll(_clb) + + s[C].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + WO, K, HO, N, K4 = get_const_tuple(C.shape) + RSC, _, _ = get_const_tuple(B.shape) + cfg.add_flop(2 * N * K * K4 * HO * WO * RSC) + + return s + +def compute_conv2d_cuda_NCHW_KCRS(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHW layout. 
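+
+    A plain NCHW direct convolution with no texture packing, kept in this
+    test alongside the NCHWc texture variants for comparison.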
+ + Parameters + ---------- + Input : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Filter : tvm.te.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel, in_height, in_width = Input.shape + num_filter, channel, kernel_h, kernel_w = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + out_channel = num_filter + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rc = te.reduce_axis((0, in_channel), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + return te.compute( + (batch, out_channel, out_height, out_width), + lambda nn, ff, yy, xx: te.sum( + temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype( + out_dtype + ) + * Filter[ff, rc, ry, rx].astype(out_dtype), + axis=[rc, ry, rx], + ), + tag="conv2d_nchw", + ) + + +def schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv): + """schedule optimized for batch size = 1""" + + ##### space definition begin ##### + n, f, y, x = s[conv].op.axis + rc, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_f", f, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + AA = s.cache_read(pad_data, "shared", [OL]) + WW = s.cache_read(kernel, "shared", [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + 
s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, f, y, x = s[OL].op.axis + rc, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rc"].apply(s, OL, rc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + + # cooperative fetching + for load in [AA, WW]: + n, f, y, x = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, KH, KW, CI = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) + + +def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHWc layout. 
""" + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape + num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rcc = te.reduce_axis((0, in_channel_chunk), name="rc") + rcb = te.reduce_axis((0, in_channel_block), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + # NCHWc x KCRSk + # texture: NCH|W|c + # texture: K|CRS|k + # c = crs//RS + # rs = crs % RS + # r = rs // W == (crs // S) % R + # s = rs % W == crs % S + Filter_tx = te.compute( + (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), + lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], + name = "packed_filter" + ) + return te.compute( + (batch, num_filter_chunk, out_height, out_width, num_filter_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb].astype( + out_dtype + ) + * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb].astype(out_dtype), + axis=[rcc, rcb, ry, rx], + ), + tag="conv2d_nchwc_kcrsk_texture", + ) + +def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): + """schedule optimized for batch size = 1""" + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + rcc, rcb, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, 
te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + rcc, rcb, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + # TODO(csullivan): check position of rcb + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[OL].vectorize(fb) + s[OL].unroll(rcb) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + _, ICKHKW, _ = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) + + +def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHWc layout. 
""" + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape + num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rcc = te.reduce_axis((0, in_channel_chunk), name="rc") + rcb = te.reduce_axis((0, in_channel_block), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + # NCHWc x KCRSk + # texture: NCH|W|c + # texture: K|CRS|k + # c = crs//RS + # rs = crs % RS + # r = rs // W == (crs // S) % R + # s = rs % W == crs % S + Filter_tx = te.compute( + (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), + lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], + name = "packed_filter" + ) + conv = te.compute( + (batch, num_filter_chunk, out_height, out_width, num_filter_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] + * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb]).astype(out_dtype), + axis=[rcc, rcb, ry, rx], + ), + tag="conv2d_nchwc_kcrsk_texture", + ) + output = te.compute(conv.shape, lambda n,fc,y,x,fb: conv[n,fc,y,x,fb].astype("float32")) + return output + + + +def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): + """schedule optimized for batch size = 1""" + + conv = output.op.input_tensors[0] + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + rcc, rcb, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused 
= s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + rcc, rcb, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + # TODO(csullivan): check position of rcb + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[OL].vectorize(fb) + s[OL].unroll(rcb) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + _, ICKHKW, _ = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) + + + +def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): + """Depthwise convolution operator in NCHWc layout. 
""" + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, channel_chunk, in_height, in_width, channel_block = Input.shape + _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape + + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + out_channel_chunk = simplify(channel_chunk * channel_multiplier) + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + + # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c] + # NCHWc x CMRS + # texture: NCH|W|c + # texture: C|MRS|c + # output: N + # m = mrs//RS + # rs = mrs % RS + # r = rs // W == (mrs // S) % R + # s = rs % W == mrs % S + Filter_tx = te.compute( + (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block), + lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb], + name = "packed_filter" + ) + + conv = te.compute( + (batch, out_channel_chunk, out_height, out_width, channel_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + (temp[nn, ffc//channel_multiplier, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffb] + * Filter_tx[ffc//channel_multiplier, ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, ffb]).astype(out_dtype), + axis=[ry, rx], + ), + tag="depthwise_conv2d_nchwc_kcrsk_texture", + ) + return te.compute(conv.shape, lambda n,ffc,y,x,ffb: conv[n,ffc,y,x,ffb].astype("float32")) + + + +def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): + """schedule optimized for batch size = 1""" + + conv = output.op.input_tensors[0] + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + 
block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + ry, rx = s[OL].op.reduce_axis + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) + s[OL].vectorize(fb) + #s[OL].unroll() + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + ICC, MKHKW, ICB = get_const_tuple(kernel.shape) + M = (OCC * OCB) // (ICC * ICB) + KHKW = MKHKW // M + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) + + +def compute_conv2d_NCHWc_KCRSk( + cfg, data, kernel, stride, padding, dilation, out_dtype=None +): + """Convolution operator for 'conv2d_NCHWc_KCRSk'. + + Parameters + ---------- + data : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.te.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + dilation : int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + out_dtype : str + The output type. This is used for mixed precision. 
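+
+    cfg : autotvm config
+        Schedule configuration; in this compute definition it is only used
+        to record the FLOP count of the workload via cfg.add_flop.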
+ + Returns + ------- + Output : tvm.te.Tensor + 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] + """ + if out_dtype is None: + out_dtype = data.dtype + ic_block_factor = 4 + oc_block_factor = 4 + + pre_computed = len(kernel.shape) == 5 + if not pre_computed: + batch, channels, height, width = get_const_tuple(data.shape) + out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + + assert ( + channels % ic_block_factor == 0 + ), "Number of input channels must divide {}".format(ic_block_factor) + assert ( + out_channels % oc_block_factor == 0 + ), "Number of output channels must divide {}".format(oc_block_factor) + + packed_data = te.compute( + (batch, channels // ic_block_factor, height, width, ic_block_factor), + lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], + name="packed_data", + ) + packed_kernel = te.compute( + ( + out_channels // oc_block_factor, + in_channels, + kernel_h, + kernel_w, + oc_block_factor + ), + lambda oc_chunk, ic, kh, kw, oc_block: kernel[ + oc_chunk * oc_block_factor + oc_block, ic, kh, kw + ], + name="packed_kernel", + ) + else: + packed_data = data + packed_kernel = kernel + + batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape) + oc_chunk, _, kernel_h, kernel_w, oc_block = get_const_tuple(packed_kernel.shape) + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + # pad the input data + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + pad_data = nn.pad(packed_data, pad_before, pad_after, name="pad_data") + + # compute the output shape + out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 + + oshape = (batch, oc_chunk, out_height, out_width, oc_block) + + icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") + icb = te.reduce_axis((0, ic_block_factor), name="ic_block") + kh = te.reduce_axis((0, kernel_h), name="kh") + kw = te.reduce_axis((0, kernel_w), name="kw") + + conv = te.compute( + oshape, + lambda n, occ, oh, ow, ocb: te.sum( + pad_data[ + n, + icc, + oh * stride_h + kh * dilation_h, + ow * stride_w + kw * dilation_w, + icb, + ] + * packed_kernel[occ, icc * ic_block + icb, kh, kw, ocb], + axis=[icc, kh, kw, icb], + ), + ) + + # Type conversion + output = te.compute( + oshape, lambda *index: conv(*index).astype(out_dtype), tag="conv2d_NCHWc_KCRSk" + ) + + num_flop = ( + batch + * oc_chunk + * oc_block + * out_height + * out_width + * ic_chunk + * ic_block + * kernel_h + * kernel_w + * 2 + ) + cfg.add_flop(num_flop) + + return output + + +def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): + """Schedule conv2d NCHWc template""" + + conv = output.op.input_tensors[0] + packed_data, packed_kernel = conv.op.input_tensors + + if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: + pad_data = packed_data + packed_data = pad_data.op.input_tensors[0] + else: + pad_data = packed_data + + # if autotvm.GLOBAL_SCOPE.in_tuning: + # # skip this part during tuning to make records accurate + # # this part will be pre-computed during NNVM's pre-compute optimization pass + # s[packed_data].pragma(s[packed_data].op.axis[0], 
"debug_skip_region") + # s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") + # else: + # if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel": + # # data and kernel are not pre-computed, schedule layout transform here + # schedule_injective_from_existing(s, packed_data) + # schedule_injective_from_existing(s, packed_kernel) + + if pad_data != packed_data: + s[pad_data].compute_inline() + + # create cache stage + AA = s.cache_read(pad_data, "shared", [conv]) + WW = s.cache_read(packed_kernel, "shared", [conv]) + + s[conv].set_scope("local") + + # handle bias + if output.op not in s.outputs: + s[output].compute_inline() + output = s.outputs[0].output(0) + + oc_chunk = nn.get_const_int(output.shape[1]) + # tile and bind spatial axes + n, f, y, x, c = s[output].op.axis + cfg.define_split("tile_n", n, num_outputs=4) + cfg.define_split("tile_f", cfg.axis(oc_chunk), num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + + # this is the scope to attach global config inside this kernel + kernel_scope, n = s[output].split(n, nparts=1) + + s[output].bind(n, te.thread_axis("blockIdx.z")) + bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + #s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf + if cfg["fuse_yx"].val: + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) + tyx = s[output].fuse(ty, tx) + s[output].bind(tyx, te.thread_axis("threadIdx.x")) + s[conv].compute_at(s[output], tyx) + + # number of threads + n_tz = cfg["tile_n"].size[2] + n_ty = cfg["tile_f"].size[2] + n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] + else: + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[conv].compute_at(s[output], tx) + + # number of threads + n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] + n_ty = cfg["tile_y"].size[2] + n_tx = cfg["tile_x"].size[2] + + # tile and bind reduction axes + n, f, y, x, c = s[conv].op.axis + rc, ry, rx, rc_block = s[conv].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) + cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) + cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) + rco, rci = cfg["tile_rc"].apply(s, conv, rc) + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) + #_, rc_block = s[conv].split(rc_block, factor=4) + #s[conv].tensorize(rc_block, _dp4a) + + s[AA].compute_at(s[conv], rxo) + s[WW].compute_at(s[conv], rxo) + + # cooperative fetching + for load in [AA, WW]: + fcd = s[load].op.axis[-1] + #fcd_outer, fcd = s[load].split(fcd, factor=4) 
+ s[load].vectorize(fcd) + #fused = s[load].op.axis[:-1] + [fcd_outer] + fused = s[load].op.axis[:-1] + fused = s[load].fuse(*fused) + + fused, tx = s[load].split(fused, factor=n_tx) + fused, ty = s[load].split(fused, factor=n_ty) + fused, tz = s[load].split(fused, factor=n_tz) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + return s + + +#@autotvm.template("matmul_vector_accumulator_tune") +def matmul_vector_acc_template(shapeA, shapeB): + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator_autotvm(*placeholders) + return s, placeholders + +#@autotvm.template("conv2d_1x1_NCHWc_RSCKk_tune") +def conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape): + placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape) + s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders) + return s, placeholders + +#@autotvm.template("conv2d_1x1_WCHNc_CRSKk_tune") +def conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape): + placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape) + s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders) + return s, (placeholders[0], placeholders[1], placeholders[-1]) + +#@autotvm.template("conv2d_cuda_NCHW_KCRS_tune") +def conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + conv = compute_conv2d_cuda_NCHW_KCRS(data, filt, [1,1], [0,0], [0,0], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [conv]]) + schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv) + return s, (data, filt, conv) + +#@autotvm.template("conv2d_cuda_NCHWc_KCRSk_tune") +def conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape): + cfg = autotvm.get_config() + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_conv2d_NCHWc_KCRSk(cfg, data, filt, [1,1], [0,0], [0,0], "float32") + s = te.create_schedule([x.op for x in [output]]) + s = schedule_conv2d_NCHWc_KCRSk(cfg, s, output) + return s, (data, filt, output) + +def conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + conv = compute_conv2d_NCHWc_KCRSk_tx(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [conv]]) + schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv) + return s, (data, filt, conv) + +def conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [output]]) + schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + return s, (data, filt, output) + +def depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = 
autotvm.get_config() + s = te.create_schedule([x.op for x in [output]]) + schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + return s, (data, filt, output) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune") +def conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune2") +def conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune") +def conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune2") +def conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) + +#@autotvm.template("depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune") +def depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape): + return depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape) + +def ref_convolution(data, kernel, stride, pad): + import mxnet as mx + groups = 1 + kernel_size = (kernel.shape[2], kernel.shape[3]) + num_filter = kernel.shape[0] + ref_res = mx.nd.Convolution( + data=mx.nd.array(data), + weight=mx.nd.array(kernel), + bias=None, + no_bias=True, + kernel=kernel_size, + stride=stride, + pad=pad, + num_filter=num_filter, + num_group=groups, + ) + return ref_res.asnumpy() + +def ref_depthwise_convolution(data, kernel, stride, pad): + import mxnet as mx + groups = kernel.shape[0] + kernel_size = (kernel.shape[2], kernel.shape[3]) + num_filter = kernel.shape[0] + multiplier = kernel.shape[1] + ref_res = mx.nd.Convolution( + data=mx.nd.array(data), + weight=mx.nd.array(kernel), + bias=None, + no_bias=True, + kernel=kernel_size, + stride=stride, + pad=pad, + num_filter=num_filter, + num_group=groups, + ) + return ref_res.asnumpy() + +def validate(workload, target, dev): + s, placeholders = workload() + func = tvm.driver.build(s, [*placeholders], target=target, name="TestFunction") + + args_tvm = [] + args_np = [] + for var in placeholders[:-1]: + var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype) + args_np.append(var_np) + args_tvm.append(tvm.nd.array(var_np, dev)) + args_tvm.append(tvm.nd.array(np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev)) + func(*args_tvm) + + if "plus_one" in workload.__name__: + np_result = args_np[0] + 1.0; + elif "matmul" in workload.__name__: + if 'inner' in workload.__name__: + np_result = np.matmul(args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)) + elif 'accum' in workload.__name__: + np_result = np.matmul(args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)) + else: + np_result = np.matmul(args_np[0].transpose((0, 2, 1)).reshape(128, 64), args_np[1].transpose(1, 0, 2).reshape(64,128)) + elif "conv2d_1x1_NCHWc_RSCKk_tune" in workload.__name__: + vec_length = args_np[1].shape[-1] + # nchwc -> nchw + args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + # rsckk -> rsck -> kcrs + args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], 
args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 2, 0, 1)) + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # nkhw -> nkhwk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + elif "conv2d_1x1_WCHNc_CRSKk_tune" in workload.__name__: + vec_length = args_np[1].shape[-1] + # wchnc -> nchw + args_np[0] = args_np[0].transpose((3, 1, 4, 2, 0)).reshape(args_np[0].shape[3], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[0]) + # crskk -> crsk -> kcrs + args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 0, 1, 2)) + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # nkhw -> nkkhw -> wkhnk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(4, 1, 3, 0, 2) + elif "NCHW_KCRS" in workload.__name__: + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + elif "NCHWc_KCRSk" in workload.__name__: + vec_length = args_np[1].shape[-1] + # nchwc -> nchw + args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + # kcrsk/cmrsc -> kcrs/cmrs + args_np[1] = args_np[1].transpose((0, 4, 1, 2, 3)).reshape(args_np[1].shape[0] * args_np[1].shape[4], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]) + if "depthwise" in workload.__name__: + #np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") + np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], []) + else: + #np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + np_result = ref_convolution(args_np[0], args_np[1], [], []) + # nkhw -> nkhwk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2) + +def verify_plus_one_rank3(): + shape =(32, 32, 4) + placeholders = compute(shape) + s = schedule(*placeholders) + return s, placeholders + +def verify_matmul(): + shape = (32, 64, 4) + placeholders = compute_matmul(shape) + s = schedule_matmul(*placeholders) + return s, placeholders + +def verify_matmul_with_local(): + shape = (32, 64, 4) + placeholders = compute_matmul(shape) + s = schedule_matmul(*placeholders, local=True) + return s, placeholders + +def verify_matmul_inner(): + shape = (32, 64, 4) + placeholders = compute_matmul_inner(shape) + s = schedule_matmul_inner(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator(): + shapeA, shapeB = (32, 64, 4), (128, 16, 4) + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator_with_local(): + shapeA, shapeB = (32, 64, 4), (128, 16, 4) + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator(*placeholders, local=True) + return s, placeholders + +def verify_plus_one_rank5(): + shape =(32, 2, 4, 4, 4) + placeholders = compute5d(shape) + s = schedule5d(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator_tune(): + shapeA, shapeB = 
(32, 64, 4), (128, 16, 4) + return matmul_vector_acc_template(shapeA, shapeB) + +def verify_conv2d_1x1_NCHWc_RSCKk_tune(): + # mobilenetv1 1x1 conv2d + input_shape, filter_shape = (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + return conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape) + +def verify_conv2d_1x1_WCHNc_CRSKk_tune(): + input_shape, filter_shape = (56, 128//4, 56, 1, 4), (128, 1, 1, 128//4, 4) + return conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape) + +def verify_conv2d_cuda_NCHW_KCRS_tune(): + # NCHW, KCRS + input_shape, filter_shape = (1, 128, 56, 56), (128, 128, 1, 1) + return conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape) + +def verify_conv2d_cuda_NCHWc_KCRSk_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_tune2(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (256, 512, 1, 1, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (128, 512, 3, 3, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (512, 512, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune2(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) + return conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape) + +def verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(): + # deeplabv3 + # [1, 144, 129, 129], [144, 1, 3, 3] + # [1, 96, 257, 257], [96, 1, 3, 3] + # [N, C, H, W], [K, 1, R, S] + # [N, C/4, H, W, 4c], [C/4, 1, R, S, 4c] + input_shape, filter_shape = (1, 96//4, 257, 257, 4), (96//4, 1, 3, 3, 4) + return depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape) + +@tvm.testing.parametrize_targets("opencl") +def test_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(target, dev): + validate(verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune, target, dev) + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) From 7d76707e15c40ab2e6eb771df0e49e65d5c318ca Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 15:23:41 -0700 Subject: [PATCH 58/59] Refactor tests to use pytest parameterization. Blacken tests. 
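
Compute and schedule helpers are renamed after the workloads they implement
(e.g. compute -> compute_plus_one_rank3, schedule5d -> schedule_plus_one_rank5)
and shape/target handling moves to tvm.testing-style parameterization. A rough
sketch of the parameterized pattern being adopted is shown below; the fixture
name and tolerances are illustrative only, and the test module's existing
numpy/tvm imports are assumed:

    shape = tvm.testing.parameter((32, 32, 4))

    @tvm.testing.parametrize_targets("opencl")
    def test_plus_one_rank3(target, dev, shape):
        X, Y = compute_plus_one_rank3(shape)
        s = schedule_plus_one_rank3(X, Y)
        func = tvm.driver.build(s, [X, Y], target=target)
        x_np = np.random.uniform(size=shape).astype("float32")
        x = tvm.nd.array(x_np, dev)
        y = tvm.nd.array(np.zeros(shape, dtype="float32"), dev)
        func(x, y)
        np.testing.assert_allclose(y.asnumpy(), x_np + 1.0, rtol=1e-2, atol=1e-2)
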
--- .../test_target_texture_codegen_opencl.py | 1025 ++++++----------- 1 file changed, 329 insertions(+), 696 deletions(-) diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py index b155d56f1346..03944c85ade5 100644 --- a/tests/python/unittest/test_target_texture_codegen_opencl.py +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -27,15 +27,16 @@ from tvm.topi import nn -def compute(shape): +def compute_plus_one_rank3(shape): X = te.placeholder(shape, name="X", dtype="float32") Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y") return X, Y -def schedule(X, Y): + +def schedule_plus_one_rank3(X, Y): s = te.create_schedule(Y.op) - #Xt = s.cache_read(X, "texture", [Y]) - #Xt = s.cache_read(X, "global", [Y]) + # Xt = s.cache_read(X, "texture", [Y]) + # Xt = s.cache_read(X, "global", [Y]) Xt = s.cache_read(X, "global.texture", [Y]) # copy to texture stage @@ -52,12 +53,14 @@ def schedule(X, Y): s[Y].vectorize(c) return s -def compute5d(shape): + +def compute_plus_one_rank5(shape): X = te.placeholder(shape, name="X", dtype="float32") Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y") return X, Y -def schedule5d(X, Y): + +def schedule_plus_one_rank5(X, Y): s = te.create_schedule(Y.op) Xt = s.cache_read(X, "global.texture", [Y]) @@ -77,19 +80,23 @@ def schedule5d(X, Y): s[Y].vectorize(e) return s + def compute_matmul(shape): A = te.placeholder(shape, name="A", dtype="float32") B = te.placeholder(shape, name="B", dtype="float32") k = te.reduce_axis((0, shape[1]), name="k") C = te.compute( - (shape[0]*shape[2], shape[0]*shape[2]), + (shape[0] * shape[2], shape[0] * shape[2]), lambda i, j: te.sum( - A[i//shape[2], k, i%shape[2]].astype("float32") * B[j//shape[2], k, j%shape[2]].astype("float32"), axis=[k] + A[i // shape[2], k, i % shape[2]].astype("float32") + * B[j // shape[2], k, j % shape[2]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -101,6 +108,7 @@ def schedule_matmul(A, B, C, local=False): bx = te.thread_axis("blockIdx.x") tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): _io, _k, _ii = s[stage].op.axis s[stage].vectorize(_ii) @@ -138,19 +146,22 @@ def copy_to_texture(stage): def compute_matmul_inner(shape): A = te.placeholder(shape, name="A", dtype="float32") B = te.placeholder(shape, name="B", dtype="float32") - k = te.reduce_axis((0, shape[1]*shape[2]), name="k") + k = te.reduce_axis((0, shape[1] * shape[2]), name="k") # (M, K) x (N, K) # (32, 256) x (32, 256) # (32, 64, 4) x (32, 64, 4) C = te.compute( (shape[0], shape[0]), lambda i, j: te.sum( - A[i, k//shape[2], k%shape[2]].astype("float32") * B[j, k//shape[2], k%shape[2]].astype("float32"), axis=[k] + A[i, k // shape[2], k % shape[2]].astype("float32") + * B[j, k // shape[2], k % shape[2]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul_inner(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -162,6 +173,7 @@ def schedule_matmul_inner(A, B, C, local=False): bx = te.thread_axis("blockIdx.x") tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): _i, _ko, _ki = s[stage].op.axis s[stage].vectorize(_ki) @@ -196,6 +208,7 @@ def copy_to_texture(stage): return s + def compute_matmul_vector_accumulator(shapeA, shapeB): # A x 
B # (K/4, M, K%4) x (K, N/4, N%4) = (M, N) @@ -204,14 +217,17 @@ def compute_matmul_vector_accumulator(shapeA, shapeB): B = te.placeholder(shapeB, name="B", dtype="float32") k = te.reduce_axis((0, shapeB[0]), name="k") C = te.compute( - (shapeA[1], shapeB[1]*shapeB[2]), + (shapeA[1], shapeB[1] * shapeB[2]), lambda i, j: te.sum( - A[k//shapeA[-1], i, k%shapeA[-1]].astype("float32") * B[k, j//shapeB[-1], j%shapeB[-1]].astype("float32"), axis=[k] + A[k // shapeA[-1], i, k % shapeA[-1]].astype("float32") + * B[k, j // shapeB[-1], j % shapeB[-1]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul_vector_accumulator(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -261,56 +277,6 @@ def copy_to_texture(stage): return s -def schedule_matmul_vector_accumulator_autotvm(A, B, C): - s = te.create_schedule(C.op) - cfg = autotvm.get_config() - - At = s.cache_read(A, "global.texture", [C]) - Bt = s.cache_read(B, "global.texture", [C]) - Al = s.cache_read(At, "local", [C]) - Bl = s.cache_read(Bt, "local", [C]) - Cl = s.cache_write(C, "local") - - def copy_to_texture(stage): - _y, _x, _v = s[stage].op.axis - s[stage].vectorize(_v) - s[stage].bind(_y, te.thread_axis("blockIdx.x")) - s[stage].bind(_x, te.thread_axis("threadIdx.x")) - - copy_to_texture(At) - copy_to_texture(Bt) - - # copy to global stage - _i, _j = s[C].op.axis - xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) - s[C].unroll(xi) - s[C].vectorize(yi) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - s[C].bind(yo, te.thread_axis("threadIdx.x")) - - # the compute stage - s[Cl].compute_at(s[C], yo) - (_k,) = Cl.op.reduce_axis - _a, _b = s[Cl].op.axis - _ko, _ki = s[Cl].split(_k, factor=4) - - s[Cl].reorder(_ko, _a, _ki, _b) - cfg.define_knob("unroll", [0, 1]) - if cfg["unroll"] == 1: - s[Cl].unroll(_ki) - s[Cl].unroll(_a) - s[Cl].vectorize(_b) - - s[Al].compute_at(s[Cl], _a) - _aa, _ka, _ba = s[Al].op.axis - s[Al].vectorize(_ba) - s[Bl].compute_at(s[Cl], _ko) - _ab, _kb, _bb = s[Bl].op.axis - s[Bl].vectorize(_bb) - s[Bl].unroll(_ab) - - - return s def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] @@ -323,13 +289,16 @@ def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): conv = te.compute( (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]), lambda n, ko, i, j, ki: te.sum( - data[n, c, i, j, c4].astype("float32") * filt[kh, kw, c*input_shape[-1] + c4, ko, ki].astype("float32"), axis=[kh, kw, c, c4] + data[n, c, i, j, c4].astype("float32") + * filt[kh, kw, c * input_shape[-1] + c4, ko, ki].astype("float32"), + axis=[kh, kw, c, c4], ), - #name="Compute_conv2d_1x1_NCHWc_RSCKk", - name = "conv2d_1x1" + # name="Compute_conv2d_1x1_NCHWc_RSCKk", + name="conv2d_1x1", ) return data, filt, conv + def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv): # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) # outputs: @@ -348,6 +317,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) copy_to_texture(Bt) @@ -382,8 +352,8 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): packed_data = te.compute( (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]), - lambda i, j, k, l: data[i, j, k//input_shape[3], k%input_shape[3], l], - name = "packed_data" + lambda i, j, k, l: data[i, j, k // input_shape[3], k % 
input_shape[3], l], + name="packed_data", ) # Logical transformation of Nd -> 3d tensor @@ -394,8 +364,14 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): # k = sk % K == (rsk % SK) % K == rsk % K packed_filter = te.compute( (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]), - lambda i, j, k: filt[i, j//(filter_shape[3] * filter_shape[2]), (j//filter_shape[3])%filter_shape[2], j%filter_shape[3], k], - name = "packed_filter" + lambda i, j, k: filt[ + i, + j // (filter_shape[3] * filter_shape[2]), + (j // filter_shape[3]) % filter_shape[2], + j % filter_shape[3], + k, + ], + name="packed_filter", ) c = te.reduce_axis((0, input_shape[1]), name="C") @@ -407,13 +383,16 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]), lambda w, ko, h, n, ki: te.sum( packed_data[w, c, h * input_shape[3] + n, c4].astype("float32") - * - packed_filter[c*input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki].astype("float32"), axis=[r, s, c, c4] + * packed_filter[ + c * input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki + ].astype("float32"), + axis=[r, s, c, c4], ), - name = "conv2d_1x1" + name="conv2d_1x1", ) return data, filt, packed_data, packed_filter, conv + def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv): # data: [W, C, H*N, c] # filter: [C, R*S*K, k] @@ -445,6 +424,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) copy_to_texture(Bt) @@ -456,7 +436,6 @@ def copy_to_texture(stage): cfg.define_split("tile_h", _h, num_outputs=4) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko) bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w) bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h) @@ -494,7 +473,7 @@ def copy_to_texture(stage): # s[C].bind(_wo, by) # s[C].bind(_hn, bz) - #s[Cl].compute_at(s[C], _hn) + # s[Cl].compute_at(s[C], _hn) s[Cl].compute_at(s[C], th) _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis @@ -504,20 +483,17 @@ def copy_to_texture(stage): cfg.define_split("tile_kh", _khl, num_outputs=2) cfg.define_split("tile_kw", _kwl, num_outputs=2) - - _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl) _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl) _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl) - #s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + # s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl) - #s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) + # s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) # s[Cl].reorder(_cl, _cl4, _kil, _wl) s[Cl].unroll(_cl4) s[Cl].unroll(_wl) s[Cl].vectorize(_kil) - _wla, _cla, _hnla, _cl4a = s[Al].op.axis s[Al].compute_at(s[Cl], _cli) s[Al].vectorize(_cl4a) @@ -536,165 +512,8 @@ def copy_to_texture(stage): return s -def compute_conv2d_cuda_NCHW_KCRS(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCHW layout. 
- - Parameters - ---------- - Input : tvm.te.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Filter : tvm.te.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] - - stride : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - Returns - ------- - Output : tvm.te.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - if out_dtype is None: - out_dtype = Input.dtype - assert isinstance(stride, int) or len(stride) == 2 - assert isinstance(dilation, int) or len(dilation) == 2 - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - batch, in_channel, in_height, in_width = Input.shape - num_filter, channel, kernel_h, kernel_w = Filter.shape - # compute the output shape - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( - padding, (dilated_kernel_h, dilated_kernel_w) - ) - out_channel = num_filter - out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) - # compute graph - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") - - rc = te.reduce_axis((0, in_channel), name="rc") - ry = te.reduce_axis((0, kernel_h), name="ry") - rx = te.reduce_axis((0, kernel_w), name="rx") - return te.compute( - (batch, out_channel, out_height, out_width), - lambda nn, ff, yy, xx: te.sum( - temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype( - out_dtype - ) - * Filter[ff, rc, ry, rx].astype(out_dtype), - axis=[rc, ry, rx], - ), - tag="conv2d_nchw", - ) - - -def schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv): - """schedule optimized for batch size = 1""" - - ##### space definition begin ##### - n, f, y, x = s[conv].op.axis - rc, ry, rx = s[conv].op.reduce_axis - cfg.define_split("tile_f", f, num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - cfg.define_split("tile_rc", rc, num_outputs=2) - cfg.define_split("tile_ry", ry, num_outputs=2) - cfg.define_split("tile_rx", rx, num_outputs=2) - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - - - pad_data, kernel = s[conv].op.input_tensors - - s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - - if conv.op in s.outputs: - output = conv - OL = s.cache_write(conv, "local") - else: - output = s.outputs[0].output(0) - s[conv].set_scope("local") - OL = conv - - AA = s.cache_read(pad_data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - kernel_scope, n = s[output].split(n, nparts=1) - - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - bf = s[output].fuse(n, bf) - 
s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) - - # tile reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - rco, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) - rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - - # unroll - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - - N, CO, OH, OW = get_const_tuple(output.shape) - _, KH, KW, CI = get_const_tuple(kernel.shape) - - if isinstance(N, int): - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) - - -def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None): """Convolution operator in NCHWc layout. 
""" if out_dtype is None: @@ -739,24 +558,29 @@ def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_ # rs = crs % RS # r = rs // W == (crs // S) % R # s = rs % W == crs % S - Filter_tx = te.compute( + Filter = te.compute( (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), - lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, crs, ffb: Filter[ + ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb + ], + name="packed_filter", ) return te.compute( (batch, num_filter_chunk, out_height, out_width, num_filter_block), lambda nn, ffc, yy, xx, ffb: te.sum( - temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb].astype( - out_dtype - ) - * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb].astype(out_dtype), + temp[ + nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb + ].astype(out_dtype) + * Filter[ + ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb + ].astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc_kcrsk_texture", ) -def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): + +def schedule_conv2d_NCHWc_KCRSk(cfg, s, conv): """schedule optimized for batch size = 1""" ##### space definition begin ##### @@ -790,6 +614,7 @@ def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -797,9 +622,13 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + # tile and bind spatial axes n, fc, y, x, fb = s[output].op.axis @@ -864,7 +693,7 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) -def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): """Convolution operator in NCHWc layout. 
""" if out_dtype is None: @@ -909,26 +738,29 @@ def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation # rs = crs % RS # r = rs // W == (crs // S) % R # s = rs % W == crs % S - Filter_tx = te.compute( + Filter = te.compute( (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), - lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, crs, ffb: Filter[ + ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb + ], + name="packed_filter", ) conv = te.compute( (batch, num_filter_chunk, out_height, out_width, num_filter_block), lambda nn, ffc, yy, xx, ffb: te.sum( - (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] - * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb]).astype(out_dtype), + ( + temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] + * Filter[ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb] + ).astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc_kcrsk_texture", ) - output = te.compute(conv.shape, lambda n,fc,y,x,fb: conv[n,fc,y,x,fb].astype("float32")) + output = te.compute(conv.shape, lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32")) return output - -def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): +def schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): """schedule optimized for batch size = 1""" conv = output.op.input_tensors[0] @@ -964,6 +796,7 @@ def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -971,6 +804,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) @@ -1042,8 +876,9 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) - -def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_depthwise_conv2d_NCHWc_KCRSk_acc32( + Input, Filter, stride, padding, dilation, out_dtype=None +): """Depthwise convolution operator in NCHWc layout. 
""" if out_dtype is None: out_dtype = Input.dtype @@ -1080,7 +915,6 @@ def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") - # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c] # NCHWc x CMRS # texture: NCH|W|c @@ -1090,26 +924,41 @@ def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding # rs = mrs % RS # r = rs // W == (mrs // S) % R # s = rs % W == mrs % S - Filter_tx = te.compute( + Filter = te.compute( (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block), - lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, mrs, ffb: Filter[ + ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb + ], + name="packed_filter", ) conv = te.compute( (batch, out_channel_chunk, out_height, out_width, channel_block), lambda nn, ffc, yy, xx, ffb: te.sum( - (temp[nn, ffc//channel_multiplier, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffb] - * Filter_tx[ffc//channel_multiplier, ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, ffb]).astype(out_dtype), + ( + temp[ + nn, + ffc // channel_multiplier, + yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, + ffb, + ] + * Filter[ + ffc // channel_multiplier, + ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, + ffb, + ] + ).astype(out_dtype), axis=[ry, rx], ), tag="depthwise_conv2d_nchwc_kcrsk_texture", ) - return te.compute(conv.shape, lambda n,ffc,y,x,ffb: conv[n,ffc,y,x,ffb].astype("float32")) - + return te.compute( + conv.shape, lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype("float32") + ) -def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): +def schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): """schedule optimized for batch size = 1""" conv = output.op.input_tensors[0] @@ -1144,6 +993,7 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -1151,6 +1001,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) @@ -1190,7 +1041,7 @@ def copy_to_texture(stage): s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) s[OL].vectorize(fb) - #s[OL].unroll() + # s[OL].unroll() s[AA].compute_at(s[OL], rxo) s[WW].compute_at(s[OL], rxo) @@ -1222,353 +1073,59 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) -def compute_conv2d_NCHWc_KCRSk( - cfg, data, kernel, stride, padding, dilation, out_dtype=None -): - """Convolution operator for 'conv2d_NCHWc_KCRSk'. 
- - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.te.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, - filter_width, num_filter_block, in_channel_block] - - stride : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - - padding : int or str - Padding size, or ['VALID', 'SAME'] - - dilation : int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - out_dtype : str - The output type. This is used for mixed precision. - - Returns - ------- - Output : tvm.te.Tensor - 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] - """ - if out_dtype is None: - out_dtype = data.dtype - ic_block_factor = 4 - oc_block_factor = 4 - - pre_computed = len(kernel.shape) == 5 - if not pre_computed: - batch, channels, height, width = get_const_tuple(data.shape) - out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) - - assert ( - channels % ic_block_factor == 0 - ), "Number of input channels must divide {}".format(ic_block_factor) - assert ( - out_channels % oc_block_factor == 0 - ), "Number of output channels must divide {}".format(oc_block_factor) - - packed_data = te.compute( - (batch, channels // ic_block_factor, height, width, ic_block_factor), - lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], - name="packed_data", - ) - packed_kernel = te.compute( - ( - out_channels // oc_block_factor, - in_channels, - kernel_h, - kernel_w, - oc_block_factor - ), - lambda oc_chunk, ic, kh, kw, oc_block: kernel[ - oc_chunk * oc_block_factor + oc_block, ic, kh, kw - ], - name="packed_kernel", - ) - else: - packed_data = data - packed_kernel = kernel - - batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape) - oc_chunk, _, kernel_h, kernel_w, oc_block = get_const_tuple(packed_kernel.shape) - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - # pad the input data - pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) - pad_before = [0, 0, pad_top, pad_left, 0] - pad_after = [0, 0, pad_down, pad_right, 0] - pad_data = nn.pad(packed_data, pad_before, pad_after, name="pad_data") - - # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - - oshape = (batch, oc_chunk, out_height, out_width, oc_block) - - icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") - icb = te.reduce_axis((0, ic_block_factor), name="ic_block") - kh = te.reduce_axis((0, kernel_h), name="kh") - kw = te.reduce_axis((0, kernel_w), name="kw") - - conv = te.compute( - oshape, - lambda n, occ, oh, ow, ocb: te.sum( - pad_data[ - n, - icc, - oh * stride_h + kh * dilation_h, - ow * stride_w + kw * dilation_w, - icb, - ] - * packed_kernel[occ, icc * ic_block + icb, kh, kw, ocb], - axis=[icc, kh, kw, icb], - ), - ) - - # Type conversion - output = te.compute( - oshape, lambda *index: conv(*index).astype(out_dtype), tag="conv2d_NCHWc_KCRSk" - ) - - num_flop = ( - batch - * oc_chunk - * oc_block 
- * out_height - * out_width - * ic_chunk - * ic_block - * kernel_h - * kernel_w - * 2 - ) - cfg.add_flop(num_flop) - - return output - - -def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): - """Schedule conv2d NCHWc template""" - - conv = output.op.input_tensors[0] - packed_data, packed_kernel = conv.op.input_tensors - - if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: - pad_data = packed_data - packed_data = pad_data.op.input_tensors[0] - else: - pad_data = packed_data - - # if autotvm.GLOBAL_SCOPE.in_tuning: - # # skip this part during tuning to make records accurate - # # this part will be pre-computed during NNVM's pre-compute optimization pass - # s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") - # s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") - # else: - # if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel": - # # data and kernel are not pre-computed, schedule layout transform here - # schedule_injective_from_existing(s, packed_data) - # schedule_injective_from_existing(s, packed_kernel) - - if pad_data != packed_data: - s[pad_data].compute_inline() - - # create cache stage - AA = s.cache_read(pad_data, "shared", [conv]) - WW = s.cache_read(packed_kernel, "shared", [conv]) - - s[conv].set_scope("local") - - # handle bias - if output.op not in s.outputs: - s[output].compute_inline() - output = s.outputs[0].output(0) - - oc_chunk = nn.get_const_int(output.shape[1]) - # tile and bind spatial axes - n, f, y, x, c = s[output].op.axis - cfg.define_split("tile_n", n, num_outputs=4) - cfg.define_split("tile_f", cfg.axis(oc_chunk), num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - - # this is the scope to attach global config inside this kernel - kernel_scope, n = s[output].split(n, nparts=1) - - s[output].bind(n, te.thread_axis("blockIdx.z")) - bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, te.thread_axis("blockIdx.z")) - #s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) - s[output].bind(vn, te.thread_axis("vthread")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf - if cfg["fuse_yx"].val: - s[output].bind(tn, te.thread_axis("threadIdx.z")) - s[output].bind(tf, te.thread_axis("threadIdx.y")) - tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, te.thread_axis("threadIdx.x")) - s[conv].compute_at(s[output], tyx) - - # number of threads - n_tz = cfg["tile_n"].size[2] - n_ty = cfg["tile_f"].size[2] - n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] - else: - s[output].bind(tn, te.thread_axis("threadIdx.z")) - s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[conv].compute_at(s[output], tx) - - # number of threads - n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] - n_ty = cfg["tile_y"].size[2] - n_tx = cfg["tile_x"].size[2] - - # tile and bind reduction axes - n, f, y, x, c = 
s[conv].op.axis - rc, ry, rx, rc_block = s[conv].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) - rco, rci = cfg["tile_rc"].apply(s, conv, rc) - ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) - rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) - - s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) - #_, rc_block = s[conv].split(rc_block, factor=4) - #s[conv].tensorize(rc_block, _dp4a) - - s[AA].compute_at(s[conv], rxo) - s[WW].compute_at(s[conv], rxo) - - # cooperative fetching - for load in [AA, WW]: - fcd = s[load].op.axis[-1] - #fcd_outer, fcd = s[load].split(fcd, factor=4) - s[load].vectorize(fcd) - #fused = s[load].op.axis[:-1] + [fcd_outer] - fused = s[load].op.axis[:-1] - fused = s[load].fuse(*fused) - - fused, tx = s[load].split(fused, factor=n_tx) - fused, ty = s[load].split(fused, factor=n_ty) - fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - return s - - -#@autotvm.template("matmul_vector_accumulator_tune") -def matmul_vector_acc_template(shapeA, shapeB): - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = schedule_matmul_vector_accumulator_autotvm(*placeholders) +def scheduler(compute, schedule, *args, **kwargs): + placeholders = compute(*args) + s = schedule(*placeholders, **kwargs) return s, placeholders -#@autotvm.template("conv2d_1x1_NCHWc_RSCKk_tune") -def conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape): + +def conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape) s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders) return s, placeholders -#@autotvm.template("conv2d_1x1_WCHNc_CRSKk_tune") -def conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape): + +def conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape) s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders) return s, (placeholders[0], placeholders[1], placeholders[-1]) -#@autotvm.template("conv2d_cuda_NCHW_KCRS_tune") -def conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape): - data = te.placeholder(input_shape, name="data", dtype="float32") - filt = te.placeholder(filter_shape, name="filter", dtype="float32") - conv = compute_conv2d_cuda_NCHW_KCRS(data, filt, [1,1], [0,0], [0,0], "float32") - cfg = autotvm.get_config() - s = te.create_schedule([x.op for x in [conv]]) - schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv) - return s, (data, filt, conv) - -#@autotvm.template("conv2d_cuda_NCHWc_KCRSk_tune") -def conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape): - cfg = autotvm.get_config() - data = te.placeholder(input_shape, name="data", dtype="float32") - filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_conv2d_NCHWc_KCRSk(cfg, data, filt, [1,1], [0,0], [0,0], "float32") - s = te.create_schedule([x.op for x in [output]]) - s = schedule_conv2d_NCHWc_KCRSk(cfg, s, output) - return s, (data, filt, output) -def conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape): +def conv2d_NCHWc_KCRSk(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - conv = compute_conv2d_NCHWc_KCRSk_tx(data, filt, 
[1,1], [0,0], [1,1], "float32") + conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [conv]]) - schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv) + schedule_conv2d_NCHWc_KCRSk(cfg, s, conv) return s, (data, filt, conv) -def conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape): + +def conv2d_NCHWc_KCRSk_fp32_acc(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + output = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [output]]) - schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) return s, (data, filt, output) -def depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape): + +def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32( + data, filt, [1, 1], [0, 0], [1, 1], "float32" + ) cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [output]]) - schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) return s, (data, filt, output) -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune") -def conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune2") -def conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune") -def conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune2") -def conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) - -#@autotvm.template("depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune") -def depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape): - return depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape) def ref_convolution(data, kernel, stride, pad): import mxnet as mx + groups = 1 kernel_size = (kernel.shape[2], kernel.shape[3]) num_filter = kernel.shape[0] @@ -1585,8 +1142,10 @@ def ref_convolution(data, kernel, stride, pad): ) return ref_res.asnumpy() + def ref_depthwise_convolution(data, kernel, stride, pad): import mxnet as mx + groups = kernel.shape[0] kernel_size = (kernel.shape[2], kernel.shape[3]) num_filter = kernel.shape[0] @@ -1604,8 +1163,9 @@ def ref_depthwise_convolution(data, kernel, stride, pad): ) return ref_res.asnumpy() -def validate(workload, target, dev): - s, placeholders = workload() + +def validate(workload, target, dev, input_shapes, *args, **kwargs): + s, placeholders = workload(*input_shapes, *args, **kwargs) func = tvm.driver.build(s, [*placeholders], 
target=target, name="TestFunction") args_tvm = [] @@ -1614,154 +1174,227 @@ def validate(workload, target, dev): var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype) args_np.append(var_np) args_tvm.append(tvm.nd.array(var_np, dev)) - args_tvm.append(tvm.nd.array(np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev)) + args_tvm.append( + tvm.nd.array( + np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev + ) + ) func(*args_tvm) if "plus_one" in workload.__name__: - np_result = args_np[0] + 1.0; + np_result = args_np[0] + 1.0 elif "matmul" in workload.__name__: - if 'inner' in workload.__name__: - np_result = np.matmul(args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)) - elif 'accum' in workload.__name__: - np_result = np.matmul(args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)) + if "inner" in workload.__name__: + np_result = np.matmul( + args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0) + ) + elif "accum" in workload.__name__: + np_result = np.matmul( + args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64) + ) else: - np_result = np.matmul(args_np[0].transpose((0, 2, 1)).reshape(128, 64), args_np[1].transpose(1, 0, 2).reshape(64,128)) - elif "conv2d_1x1_NCHWc_RSCKk_tune" in workload.__name__: + np_result = np.matmul( + args_np[0].transpose((0, 2, 1)).reshape(128, 64), + args_np[1].transpose(1, 0, 2).reshape(64, 128), + ) + elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__: vec_length = args_np[1].shape[-1] # nchwc -> nchw - args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + args_np[0] = ( + args_np[0] + .transpose((0, 1, 4, 2, 3)) + .reshape( + args_np[0].shape[0], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[3], + ) + ) # rsckk -> rsck -> kcrs - args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 2, 0, 1)) + args_np[1] = ( + args_np[1] + .reshape( + args_np[1].shape[0], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3] * args_np[1].shape[4], + ) + .transpose((3, 2, 0, 1)) + ) np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) # nkhw -> nkhwk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) - elif "conv2d_1x1_WCHNc_CRSKk_tune" in workload.__name__: + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(0, 1, 3, 4, 2) + elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__: vec_length = args_np[1].shape[-1] # wchnc -> nchw - args_np[0] = args_np[0].transpose((3, 1, 4, 2, 0)).reshape(args_np[0].shape[3], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[0]) + args_np[0] = ( + args_np[0] + .transpose((3, 1, 4, 2, 0)) + .reshape( + args_np[0].shape[3], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[0], + ) + ) # crskk -> crsk -> kcrs - args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 0, 1, 2)) + args_np[1] = ( + args_np[1] + .reshape( 
+ args_np[1].shape[0], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3] * args_np[1].shape[4], + ) + .transpose((3, 0, 1, 2)) + ) np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) # nkhw -> nkkhw -> wkhnk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(4, 1, 3, 0, 2) + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(4, 1, 3, 0, 2) elif "NCHW_KCRS" in workload.__name__: np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) elif "NCHWc_KCRSk" in workload.__name__: vec_length = args_np[1].shape[-1] # nchwc -> nchw - args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + args_np[0] = ( + args_np[0] + .transpose((0, 1, 4, 2, 3)) + .reshape( + args_np[0].shape[0], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[3], + ) + ) # kcrsk/cmrsc -> kcrs/cmrs - args_np[1] = args_np[1].transpose((0, 4, 1, 2, 3)).reshape(args_np[1].shape[0] * args_np[1].shape[4], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]) + args_np[1] = ( + args_np[1] + .transpose((0, 4, 1, 2, 3)) + .reshape( + args_np[1].shape[0] * args_np[1].shape[4], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3], + ) + ) if "depthwise" in workload.__name__: - #np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") + # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], []) else: - #np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) np_result = ref_convolution(args_np[0], args_np[1], [], []) # nkhw -> nkhwk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(0, 1, 3, 4, 2) np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2) -def verify_plus_one_rank3(): - shape =(32, 32, 4) - placeholders = compute(shape) - s = schedule(*placeholders) - return s, placeholders -def verify_matmul(): - shape = (32, 64, 4) - placeholders = compute_matmul(shape) - s = schedule_matmul(*placeholders) - return s, placeholders +class BaseSingleShapeValidator: + @tvm.testing.parametrize_targets("opencl") + def test_unary(self, test_func, input_shape, target, dev): + validate(test_func, target, dev, [input_shape]) -def verify_matmul_with_local(): - shape = (32, 64, 4) - placeholders = compute_matmul(shape) - s = schedule_matmul(*placeholders, local=True) - return s, placeholders -def verify_matmul_inner(): - shape = (32, 64, 4) - placeholders = compute_matmul_inner(shape) - s = schedule_matmul_inner(*placeholders) - return s, placeholders +class TestPlusOneRank3(BaseSingleShapeValidator): + input_shape = tvm.testing.parameter((32, 32, 4)) -def verify_matmul_vector_accumulator(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = 
schedule_matmul_vector_accumulator(*placeholders) - return s, placeholders + def plus_one(input_shape): + return scheduler(compute_plus_one_rank3, schedule_plus_one_rank3, input_shape) -def verify_matmul_vector_accumulator_with_local(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = schedule_matmul_vector_accumulator(*placeholders, local=True) - return s, placeholders + test_func = tvm.testing.parameter(plus_one) -def verify_plus_one_rank5(): - shape =(32, 2, 4, 4, 4) - placeholders = compute5d(shape) - s = schedule5d(*placeholders) - return s, placeholders -def verify_matmul_vector_accumulator_tune(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - return matmul_vector_acc_template(shapeA, shapeB) - -def verify_conv2d_1x1_NCHWc_RSCKk_tune(): - # mobilenetv1 1x1 conv2d - input_shape, filter_shape = (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) - return conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape) - -def verify_conv2d_1x1_WCHNc_CRSKk_tune(): - input_shape, filter_shape = (56, 128//4, 56, 1, 4), (128, 1, 1, 128//4, 4) - return conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape) - -def verify_conv2d_cuda_NCHW_KCRS_tune(): - # NCHW, KCRS - input_shape, filter_shape = (1, 128, 56, 56), (128, 128, 1, 1) - return conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape) - -def verify_conv2d_cuda_NCHWc_KCRSk_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_tune2(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (256, 512, 1, 1, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (128, 512, 3, 3, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (512, 512, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune2(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) - return conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape) - -def verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(): - # deeplabv3 - # [1, 144, 129, 129], [144, 1, 3, 3] - # [1, 96, 257, 257], [96, 1, 3, 3] - # [N, C, H, W], [K, 1, R, S] - # [N, C/4, H, W, 4c], [C/4, 1, R, S, 4c] - input_shape, filter_shape = (1, 96//4, 257, 257, 4), (96//4, 1, 3, 3, 4) - return depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape) - -@tvm.testing.parametrize_targets("opencl") -def test_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(target, dev): - validate(verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune, target, dev) +class TestPlusOneRank5(BaseSingleShapeValidator): + input_shape = tvm.testing.parameter((32, 2, 4, 4, 4)) + + def plus_one(input_shape): + return scheduler(compute_plus_one_rank5, schedule_plus_one_rank5, input_shape) + + test_func = tvm.testing.parameter(plus_one) + + +class TestMatmul: + input_shape = tvm.testing.parameter((32, 64, 4)) + local = 
tvm.testing.parameter(False, True) + + def matmul(input_shape, local): + return scheduler(compute_matmul, schedule_matmul, input_shape, local=local) + + def matmul_inner(input_shape, local): + return scheduler(compute_matmul_inner, schedule_matmul_inner, input_shape, local=local) + + test_func = tvm.testing.parameter(matmul, matmul_inner) + + @tvm.testing.parametrize_targets("opencl") + def test_matmul(self, test_func, input_shape, local, target, dev): + validate(test_func, target, dev, [input_shape], local=local) + + +class TestMatmulVectorAccumulator: + shapeA = tvm.testing.parameter((32, 64, 4)) + shapeB = tvm.testing.parameter((128, 16, 4)) + local = tvm.testing.parameter(False, True) + + def matmul_vector_accumulator(shapeA, shapeB, local): + return scheduler( + compute_matmul_vector_accumulator, + schedule_matmul_vector_accumulator, + shapeA, + shapeB, + local=local, + ) + + test_func = tvm.testing.parameter(matmul_vector_accumulator) + + @tvm.testing.parametrize_targets("opencl") + def test_matmul_vec_acc(self, test_func, shapeA, shapeB, local, target, dev): + validate(test_func, target, dev, [shapeA, shapeB], local=local) + + +class BaseConv2DValidator: + @tvm.testing.parametrize_targets("opencl") + def test_conv2d(self, test_func, input_shapes, target, dev): + validate(test_func, target, dev, input_shapes) + + +class TestConv2dNCHWcRSCKk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(1, 32, 56, 56, 4), (1, 1, 128, 32, 4)]) + test_func = tvm.testing.parameter(conv2d_1x1_NCHWc_RSCKk) + + +class TestConv2dWCHNcCRSKk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(56, 32, 56, 1, 4), (128, 1, 1, 32, 4)]) + test_func = tvm.testing.parameter(conv2d_1x1_WCHNc_CRSKk) + + +class TestConv2dNCHWcKCRSk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter( + [(1, 32, 56, 56, 4), (32, 128, 1, 1, 4)], [(1, 32, 112, 112, 4), (32, 128, 3, 3, 4)] + ) + test_func = tvm.testing.parameter(conv2d_NCHWc_KCRSk, conv2d_NCHWc_KCRSk_fp32_acc) + + +class TestDepthwiseConv2dNCHWcKCRSk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(1, 24, 257, 257, 4), (24, 1, 3, 3, 4)]) + test_func = tvm.testing.parameter(depthwise_conv2d_NCHWc_KCRSk_acc32) + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) From 2903e53e13d104032015484492a52bcf4363b4e8 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 21:29:20 -0700 Subject: [PATCH 59/59] Respond to CRs. 
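
Per review, rename the InferTextureAccess access-mask constants read_access and write_access to kReadAccess and kWriteAccess, matching the k-prefixed constant naming convention.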
--- src/target/source/codegen_opencl.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8d760a07e032..7abff36a3ddb 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -36,19 +36,19 @@ namespace codegen { class InferTextureAccess : public StmtExprVisitor { public: - static constexpr const uint8_t read_access = 1; - static constexpr const uint8_t write_access = 2; + static constexpr const uint8_t kReadAccess = 1; + static constexpr const uint8_t kWriteAccess = 2; InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { - if (texture.second == read_access) { + if (texture.second == kReadAccess) { storage_scope_qualifiers.insert({texture.first, "texture_read"}); - } else if (texture.second == write_access) { + } else if (texture.second == kWriteAccess) { storage_scope_qualifiers.insert({texture.first, "texture_write"}); - } else if (texture.second == (read_access | write_access)) { + } else if (texture.second == (kReadAccess | kWriteAccess)) { storage_scope_qualifiers.insert({texture.first, ""}); } } @@ -56,9 +56,9 @@ class InferTextureAccess : public StmtExprVisitor { } void VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::texture2d_load())) { - var_access_map_[op->args[0].as()] |= read_access; + var_access_map_[op->args[0].as()] |= kReadAccess; } else if (op->op.same_as(builtin::texture2d_store())) { - var_access_map_[op->args[0].as()] |= write_access; + var_access_map_[op->args[0].as()] |= kWriteAccess; } else { StmtExprVisitor::VisitExpr_(op); }