From 245ed1d218eb3ed13c88f58c0604376b0572e8f1 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 27 Oct 2020 16:57:48 -0700 Subject: [PATCH 01/59] Add support for kTexture storage rank. --- src/runtime/thread_storage_scope.h | 7 +++++++ src/te/operation/op_utils.cc | 6 ++++-- src/te/schedule/bound.cc | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index ac8260ffbe39..611a40d996ea 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -59,6 +59,8 @@ enum class StorageRank { kWMMAMatrixB = 5, /*! \brief wmma scope memory of accumulator */ kWMMAAccumulator = 6, + /*! \brief global scope texture memory */ + kTexture = 7, }; /*! @@ -108,6 +110,8 @@ struct StorageScope { return "wmma.matrix_b" + tag; case StorageRank::kWMMAAccumulator: return "wmma.accumulator" + tag; + case StorageRank::kTexture: + return "texture" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; @@ -143,6 +147,9 @@ struct StorageScope { } else if (s.compare(0, 16, "wmma.accumulator") == 0) { r.rank = StorageRank::kWMMAAccumulator; r.tag = s.substr(16, std::string::npos); + } else if (s.compare(0, 7, "texture") == 0) { + r.rank = StorageRank::kTexture; + r.tag = s.substr(7, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; } diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index b3897e142545..de0d6b5be848 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -156,10 +156,12 @@ std::vector > MakeLoopNest(const Stage& stage, nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { value_map[iv] = dom->min; + } else if (stage->scope == "") { + value_map[iv] = var; } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); - if (stage->scope == "" || - static_cast(runtime::StorageScope::Create(stage->scope).rank) <= ts.rank) { + runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); + if (static_cast(ss.rank) <= ts.rank || ss.rank == runtime::StorageRank::kTexture) { value_map[iv] = var; } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 12c9b5538b44..c7ec8f23892c 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -66,7 +66,7 @@ bool NeedRelax(const IterVar& iv, bool found_attach, if (scope.rank == StorageRank::kWarp && ts.rank == 1 && ts.dim_index == 0) { return true; } - return static_cast(scope.rank) <= ts.rank; + return static_cast(scope.rank) <= ts.rank || scope.rank == StorageRank::kTexture; } // infer storage scope, if not given From 9c0921320e4a26294547eff4c7d9ed7b92120d2c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 2 Nov 2020 14:39:40 -0800 Subject: [PATCH 02/59] Add scaffolding for texture_flatten pass. 
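This change only wires up the pass boilerplate: a Python binding, the FFI
registration, and a TextureFlattener mutator whose BufferStore/BufferLoad
visitors are still stubs. For context, a minimal schedule that would exercise
the "texture" scope introduced in the previous patch might look like the
sketch below. The pipeline, shapes, and names are illustrative assumptions,
not part of this change; the innermost extent of 4 anticipates the RGBA
constraint added later in the series.

    # Illustrative sketch only, not part of this patch.
    import tvm
    from tvm import te

    n = 64
    A = te.placeholder((n, n, 4), name="A", dtype="float32")
    B = te.compute((n, n, 4), lambda i, j, c: A[i, j, c] * 2.0, name="B")
    C = te.compute((n, n, 4), lambda i, j, c: B[i, j, c] + 1.0, name="C")

    s = te.create_schedule(C.op)
    s[B].set_scope("texture")  # uses the kTexture storage rank from patch 01

Once registered, the pass composes like any other TIR pass, e.g.
tvm.tir.transform.TextureFlatten(cache_line_size, create_bound_attribute)(mod)
applied to a lowered IRModule.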
--- python/tvm/tir/transform/transform.py | 20 ++++ src/tir/transforms/texture_flatten.cc | 137 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 src/tir/transforms/texture_flatten.cc diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 537499a27fa9..a93cf2c3b3f8 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,6 +94,26 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore +def TextureFlatten(cache_line_size, create_bound_attribute=False): + """Flatten the multi-dimensional read/write to 1D. + + + Parameters + ---------- + cache_line_size: int + The size of CPU cache line. + + create_bound_attribute: + Whether to create bound attributes. + + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten(cache_line_size, create_bound_attribute) + def InjectCopyIntrin(pragma_key: str, fintrin): """Inject virtual thread loops. diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc new file mode 100644 index 000000000000..b0e7bd0379dc --- /dev/null +++ b/src/tir/transforms/texture_flatten.cc @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file storage_flatten.cc + * \brief Flattens storage from multi-dimensional array to 1D buffer access + */ +// The pass definition originates from Halide pipeline. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../arith/ir_visitor_with_analyzer.h" +#include "../../runtime/thread_storage_scope.h" +#include "arg_binder.h" +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +using runtime::StorageRank; +using runtime::StorageScope; +using runtime::ThreadScope; + +class TextureFlattener : public StmtExprMutator { + public: + explicit TextureFlattener(const Map& extern_buffer_map, int cache_line_size, + bool create_bound_attributes, IRVisitorWithAnalyzer* bound_analyzer) {} + + Stmt VisitStmt_(const AttrStmtNode* op) final { + if (op->attr_key == attr::realize_scope) { + storage_scope_[op->node.get()] = op->value.as()->value; + } + return StmtExprMutator::VisitStmt_(op); + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // stmt = Evaluate(Call(op->buffer->dtype, builtin::isnan(), {op->value})); + } + return stmt; + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // expr = Call(op->buffer->dtype, builtin::isnan(), {expr}); + } + return expr; + } + private: + // Storage scope + std::unordered_map storage_scope_; +}; + +PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { + auto fptr = func.CopyOnWrite(); + + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, + &bound_analyzer)(std::move(fptr->body)); + return func; +} + +namespace transform { + +Pass TextureFlatten(int cache_line_size, bool create_bound_attributes) { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureFlatten(std::move(f), cache_line_size, create_bound_attributes); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.TextureFlatten").set_body_typed(TextureFlatten); + +} // namespace transform + +} // namespace tir +} // namespace tvm From d76878cc246660d0ecb6c395d07b22c363336866 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 2 Nov 2020 15:50:21 -0800 Subject: [PATCH 03/59] Add scaffolding for texture allocation. 
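The BufferRealize handling added here is still commented out; it sketches how
a texture-scoped realize will eventually become an allocation plus a
storage_scope attribute. To see the realize nodes this visitor targets, the
pass from the previous patch can be run by hand over a lowered module, as in
the sketch below, which continues the patch 02 example and is illustrative
only: where TextureFlatten should sit relative to StorageFlatten in the
built-in lowering pipeline is not settled by this change.

    # Illustrative sketch only, continuing the earlier example.
    mod = tvm.lower(s, [A, C], name="main")
    mod = tvm.tir.transform.TextureFlatten(64, False)(mod)
    print(mod)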
--- src/tir/transforms/texture_flatten.cc | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index b0e7bd0379dc..eedb2ff3bf56 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,6 +62,40 @@ class TextureFlattener : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } + Stmt VisitStmt_(const BufferRealizeNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + std::string storage_scope; + auto it = storage_scope_.find(op->buffer.get()); + if (it != storage_scope_.end()) + { + storage_scope = it->second; + } + else + { + storage_scope = op->buffer->scope; + } + if (storage_scope == "texture") + { + // TODO(csullivan): Implement texture intrinsic as builtin + // Stmt body = this->VisitStmt(op->body); + // Array shape; + // for (auto r : op->bounds) { + // shape.push_back(r->extent); + // } + // if (shape.size() == 0) { + // shape.push_back(make_const(DataType::Int(32), 1)); + // } + // DataType storage_type = op->buffer->dtype; + // // TODO(csullivan): Consider check on float only + // stmt = Allocate(op->buffer->data, storage_type, shape, + // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); + // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + } + return stmt; + } + Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -111,12 +145,14 @@ class TextureFlattener : public StmtExprMutator { }; PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { + // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, &bound_analyzer)(std::move(fptr->body)); + // std::cout << "After TextureFlattening: " << func << std::endl; return func; } From e7c276b8fbd9afdeff1d8e83b93a19aec7dd7232 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 3 Nov 2020 14:16:54 -0800 Subject: [PATCH 04/59] Implement 2d texture flattening to builtin tir.text2d_alloca. --- include/tvm/tir/builtin.h | 4 ++++ src/tir/op/builtin.cc | 3 +++ src/tir/transforms/texture_flatten.cc | 27 +++++++++++++-------------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 61280d33f1df..9f58985df758 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -600,6 +600,10 @@ TVM_DLL const Op& vectorcombine(); * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA */ TVM_DLL const Op& atomic_add(); +/*! + * \brief Create a texture 2d memory allocation + */ +TVM_DLL const Op& text2d_alloca(); /*! 
\brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index f0ca04cbd5fd..4d6575eecaf7 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -246,6 +246,9 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index eedb2ff3bf56..a38a498612e7 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -78,20 +78,19 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // Stmt body = this->VisitStmt(op->body); - // Array shape; - // for (auto r : op->bounds) { - // shape.push_back(r->extent); - // } - // if (shape.size() == 0) { - // shape.push_back(make_const(DataType::Int(32), 1)); - // } - // DataType storage_type = op->buffer->dtype; - // // TODO(csullivan): Consider check on float only - // stmt = Allocate(op->buffer->data, storage_type, shape, - // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); - // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + Stmt body = this->VisitStmt(op->body); + Array shape; + for (auto r : op->bounds) { + shape.push_back(r->extent); + } + ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + + // TODO(csullivan): Consider check on float only + StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); + stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); } return stmt; } From bf321c9f4b061353ebb5e552bac68b2df2d7bec1 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 10:06:01 -0800 Subject: [PATCH 05/59] Lower BufferStore/Load to builtin texture store/load. --- include/tvm/tir/builtin.h | 10 ++++++++++ src/tir/op/builtin.cc | 6 ++++++ src/tir/transforms/texture_flatten.cc | 26 ++++++++++++++++++++------ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 9f58985df758..66fa069d62fa 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -605,6 +605,16 @@ TVM_DLL const Op& atomic_add(); */ TVM_DLL const Op& text2d_alloca(); +/*! + * \brief Store to a texture 2d memory + */ +TVM_DLL const Op& text2d_store(); + +/*! + * \brief Load from a texture 2d memory + */ +TVM_DLL const Op& text2d_load(); + /*! 
\brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { // array head address diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 4d6575eecaf7..ae6397ba9e5c 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -249,6 +249,12 @@ TIR_DEFINE_BUILTIN_FUNC(atomic_add) TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(text2d_store) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(text2d_load) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace builtin } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index a38a498612e7..256ac3cda4dc 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -86,12 +86,14 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // TODO(csullivan): Consider check on float only + // TODO(csullivan): Consider check on float only? StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); Array args = {dtype, shape[0], shape[1]}; stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); - stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating + // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); } + return stmt; } @@ -111,9 +113,15 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // stmt = Evaluate(Call(op->buffer->dtype, builtin::isnan(), {op->value})); + // TODO(csullivan): Need autovectorization + Array args = {op->buffer->data, op->value}; + for (auto& i : op->indices) + { + args.push_back(i); + } + stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); } + return stmt; } @@ -133,9 +141,15 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Implement texture intrinsic as builtin - // expr = Call(op->buffer->dtype, builtin::isnan(), {expr}); + // TODO(csullivan): Need autovectorization + Array args = {op->buffer->data}; + for (auto& i : op->indices) + { + args.push_back(i); + } + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); } + return expr; } private: From 8afb61159183762996fbfe9b1442fcc7ec6d2d0c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 10:45:47 -0800 Subject: [PATCH 06/59] Add vectorizable attribure to texture load and store. 
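TVectorizable is the attribute the vectorize pass consults when deciding
whether a call may keep its vector lanes rather than being scalarized, so
marking the texture load/store builtins keeps the lanes of the RGBA axis
together when the surrounding loop is vectorized. The sketch below shows the
manual form of what a later patch automates; it continues the earlier example
and is illustrative only.

    # Illustrative sketch only, continuing the earlier example.
    i, j, c = s[B].op.axis
    s[B].vectorize(c)  # vectorize the length-4 RGBA axis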
--- src/tir/op/builtin.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index ae6397ba9e5c..7705369eb5c8 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -250,9 +250,11 @@ TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(text2d_store) + .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(text2d_load) + .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); } // namespace builtin From 17ca755a76749354f65d561fa15dca49b1ce81ce Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 4 Nov 2020 16:57:14 -0800 Subject: [PATCH 07/59] Support auto-vectorization on the innermost (RGBA) axis. --- python/tvm/tir/transform/transform.py | 10 +---- src/tir/transforms/texture_flatten.cc | 53 ++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index a93cf2c3b3f8..4cdf7d47856e 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,25 +94,19 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore -def TextureFlatten(cache_line_size, create_bound_attribute=False): +def TextureFlatten(): """Flatten the multi-dimensional read/write to 1D. Parameters ---------- - cache_line_size: int - The size of CPU cache line. - - create_bound_attribute: - Whether to create bound attributes. - Returns ------- fpass : tvm.transform.Pass The result pass """ - return _ffi_api.TextureFlatten(cache_line_size, create_bound_attribute) + return _ffi_api.TextureFlatten() def InjectCopyIntrin(pragma_key: str, fintrin): diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 256ac3cda4dc..9a92476021f3 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -52,8 +52,7 @@ using runtime::ThreadScope; class TextureFlattener : public StmtExprMutator { public: - explicit TextureFlattener(const Map& extern_buffer_map, int cache_line_size, - bool create_bound_attributes, IRVisitorWithAnalyzer* bound_analyzer) {} + explicit TextureFlattener() : needs_vectorization_(true) {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::realize_scope) { @@ -120,6 +119,10 @@ class TextureFlattener : public StmtExprMutator { args.push_back(i); } stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + if (needs_vectorization_) + { + loop_vars_.insert({op->indices.back().get(), true}); + } } return stmt; @@ -148,32 +151,62 @@ class TextureFlattener : public StmtExprMutator { args.push_back(i); } expr = Call(op->buffer->dtype, builtin::text2d_load(), args); + if (needs_vectorization_) + { + loop_vars_.insert({op->indices.back().get(), true}); + } } return expr; } + + // Auto-vectorize texture load and store loops + Stmt VisitStmt_(const ForNode* op) final { + Stmt stmt; + if (!needs_vectorization_) + { + stmt = StmtMutator::VisitStmt_(op); + } + else if (op->for_type == ForType::Serial) + { + stmt = StmtMutator::VisitStmt_(op); + auto it = loop_vars_.find(op->loop_var.get()); + if (it != loop_vars_.end() && it->second) + { + stmt = For(op->loop_var, op->min, op->extent, ForType::Vectorized, op->device_api, 
op->body); + stmt = StmtMutator::VisitStmt_(stmt.as()); + } + } + else + { + needs_vectorization_ = false; + stmt = StmtMutator::VisitStmt_(op); + needs_vectorization_ = true; + } + + return stmt; + } + private: // Storage scope std::unordered_map storage_scope_; + std::unordered_map loop_vars_; + bool needs_vectorization_; }; -PrimFunc TextureFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { +PrimFunc TextureFlatten(PrimFunc func) { // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); - - IRVisitorWithAnalyzer bound_analyzer; - bound_analyzer(fptr->body); - fptr->body = TextureFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, - &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener()(std::move(fptr->body)); // std::cout << "After TextureFlattening: " << func << std::endl; return func; } namespace transform { -Pass TextureFlatten(int cache_line_size, bool create_bound_attributes) { +Pass TextureFlatten() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { - return TextureFlatten(std::move(f), cache_line_size, create_bound_attributes); + return TextureFlatten(std::move(f)); }; return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {}); } From 560baa9cb8f73cef48d46e71d2fd79fd5cea1b0d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 9 Nov 2020 11:14:37 -0800 Subject: [PATCH 08/59] Add read/write_imagef opencl codegen for builtin texture load/store. --- src/target/source/codegen_opencl.cc | 28 +++++++ src/tir/transforms/texture_flatten.cc | 101 +++++++++++++++++++++++--- src/tir/transforms/vectorize_loop.cc | 12 +++ 3 files changed, 131 insertions(+), 10 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index edb614d9c122..cb7b0f733b1a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -243,6 +243,34 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; this->PrintExpr(load->index, os); os << ')'; + } else if (op->op.same_as(builtin::text2d_store())) { + os << "write_imagef("; + this->PrintExpr(op->args[0], os); + os << ", "; + os << "(int2)("; + this->PrintExpr(op->args[2], os); + os << ", "; + this->PrintExpr(op->args[1], os); + os << "), "; + this->PrintExpr(op->args[3], os); + os << ")"; + } else if (op->op.same_as(builtin::text2d_load())) { + /* + float4 read_imagef(read_only image2d_t image, + sampler_t sampler, + int2 coord) + */ + // std::cout << "LOAD\n"; + // std::cout << op->args << std::endl; + os << "read_imagef("; + this->PrintExpr(op->args[0], os); + os << ", "; + os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; + os << "(int2)("; + this->PrintExpr(op->args[2], os); + os << ", "; + this->PrintExpr(op->args[1], os); + os << "))"; } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); // Enable atomics extension if used. 
diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 9a92476021f3..b97c32b391c9 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,6 +62,10 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { + //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); + // Var buffer_var(op->buffer->data->name_hint, vdtype); + // let_binding_.insert({op->buffer->data, buffer_var}); + Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -86,8 +90,10 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? - StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + //StringImm dtype = StringImm(runtime::DLDataType2String(vdtype)); + StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->data.dtype())); Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); @@ -96,6 +102,46 @@ class TextureFlattener : public StmtExprMutator { return stmt; } + // Stmt VisitStmt_(const BufferRealizeNode* op) final { + // //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); + // // Var buffer_var(op->buffer->data->name_hint, vdtype); + // // let_binding_.insert({op->buffer->data, buffer_var}); + + // Stmt stmt = StmtExprMutator::VisitStmt_(op); + // op = stmt.as(); + + // std::string storage_scope; + // auto it = storage_scope_.find(op->buffer.get()); + // if (it != storage_scope_.end()) + // { + // storage_scope = it->second; + // } + // else + // { + // storage_scope = op->buffer->scope; + // } + // if (storage_scope == "texture") + // { + // Stmt body = this->VisitStmt(op->body); + // Array shape; + // for (auto r : op->bounds) { + // shape.push_back(r->extent); + // } + // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + + // // TODO(csullivan): Consider check on float only? 
+ // StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); + // Array args = {dtype, shape[0], shape[1]}; + // stmt = Allocate(op->buffer->data, op->buffer->dtype, shape, + // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); + // // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating + // //stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + // } + + // return stmt; + // } + Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -112,13 +158,29 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Need autovectorization - Array args = {op->buffer->data, op->value}; - for (auto& i : op->indices) + Array args; + if (let_binding_.count(op->buffer->data)) + { + args.push_back(let_binding_[op->buffer->data]); + } + else { - args.push_back(i); + args.push_back(op->buffer->data); } - stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + // for (auto& i : op->indices) + // { + // args.push_back(i); + // } + + // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. + // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. + for (size_t i = 0u; i < op->indices.size()-1; i++) + { + args.push_back(op->indices[i]); + } + args.push_back(op->value); + + stmt = Evaluate(Call(DataType::Void(), builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); @@ -144,12 +206,29 @@ class TextureFlattener : public StmtExprMutator { } if (storage_scope == "texture") { - // TODO(csullivan): Need autovectorization - Array args = {op->buffer->data}; - for (auto& i : op->indices) + Array args; + if (let_binding_.count(op->buffer->data)) + { + args.push_back(let_binding_[op->buffer->data]); + } + else + { + args.push_back(op->buffer->data); + } + + + // for (auto& i : op->indices) + // { + // args.push_back(i); + // } + + // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. + // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. 
+ for (size_t i = 0u; i < op->indices.size()-1; i++) { - args.push_back(i); + args.push_back(op->indices[i]); } + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { @@ -190,6 +269,8 @@ class TextureFlattener : public StmtExprMutator { private: // Storage scope std::unordered_map storage_scope_; + // Let binding + std::unordered_map let_binding_; std::unordered_map loop_vars_; bool needs_vectorization_; }; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 64956bc8ee54..3f33667ea2da 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -266,6 +266,18 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::if_then_else())) { return MutateIfThenElseExpr_(op); } + else if (op->op.same_as(builtin::text2d_load())) + { + return Call(op->dtype.with_lanes(4), op->op, op->args); + } + else if (op->op.same_as(builtin::text2d_store())) + { + int lane = 0; + Array value{op->args.back()}; + Array mutated_value = MutateArray(value, &lane); + Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; + return Call(op->dtype.with_lanes(lane), op->op, new_args); + } auto* op_ptr = op->op.as(); bool vectorizable = op_ptr && op_vectorizable_.get(GetRef(op_ptr), false); From 14806f51618cb8bc3d0de37e9b1d84b10a788c85 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 12 Nov 2020 11:42:04 -0800 Subject: [PATCH 09/59] Add TextureType support. --- include/tvm/ir/type.h | 49 +++++++++++++++++++++++++++ include/tvm/ir/type_functor.h | 4 +++ src/ir/type.cc | 27 +++++++++++++++ src/ir/type_functor.cc | 12 +++++++ src/printer/text_printer.h | 1 + src/printer/tir_text_printer.cc | 6 ++++ src/printer/tvmscript_printer.cc | 7 ++++ src/target/source/codegen_opencl.cc | 15 ++++++++ src/target/source/codegen_opencl.h | 1 + src/tir/op/op.cc | 2 +- src/tir/transforms/texture_flatten.cc | 6 ++-- 11 files changed, 126 insertions(+), 4 deletions(-) diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index c772650809fa..8d073e88b0ab 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -189,6 +189,55 @@ class PointerType : public Type { TVM_DEFINE_OBJECT_REF_METHODS(PointerType, Type, PointerTypeNode); }; +/*! + * \brief Low-level texture type. + * + * TextureType represents type hints in the TIR to be + * passed to the final code generator. + * + * TextureType should not occur in the high-level analysis. + * + * \sa TextureType + */ +class TextureTypeNode : public TypeNode { + public: + /*! + * \brief The base type of the texture. + */ + Type element_type; + + void VisitAttrs(AttrVisitor* v) { v->Visit("element_type", &element_type); } + + bool SEqualReduce(const TextureTypeNode* other, SEqualReducer equal) const { + return equal(element_type, other->element_type); + } + + void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(element_type); } + + static constexpr const char* _type_key = "TextureType"; + TVM_DECLARE_FINAL_OBJECT_INFO(TextureTypeNode, TypeNode); +}; + +/* + * \brief Managed reference to TextureTypeNode. + * \sa TextureTypeNode + */ +class TextureType : public Type { + public: + /*! + * \brief Constructor + * \param element_type The base type of the texture. + */ + TVM_DLL explicit TextureType(Type element_type); + /*! + * \brief Constructor + * \param element_type The base type of the texture. 
+ */ + TVM_DLL explicit TextureType(runtime::DataType dtype); + + TVM_DEFINE_OBJECT_REF_METHODS(TextureType, Type, TextureTypeNode); +}; + /*! \brief Possible kinds of TypeVars. */ enum TypeKind : int { kType = 0, diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index 11bf7d4740d0..c71051e6f61c 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -89,6 +89,7 @@ class TypeFunctor { virtual R VisitType_(const TypeDataNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PrimTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PointerTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; + virtual R VisitType_(const TextureTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitTypeDefault_(const Object* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; // unreachable, written to stop compiler warning @@ -112,6 +113,7 @@ class TypeFunctor { TVM_TYPE_FUNCTOR_DISPATCH(TypeDataNode); TVM_TYPE_FUNCTOR_DISPATCH(PrimTypeNode); TVM_TYPE_FUNCTOR_DISPATCH(PointerTypeNode); + TVM_TYPE_FUNCTOR_DISPATCH(TextureTypeNode); return vtable; } }; @@ -135,6 +137,7 @@ class TVM_DLL TypeVisitor : public TypeFunctor { void VisitType_(const TypeDataNode* op) override; void VisitType_(const PrimTypeNode* op) override; void VisitType_(const PointerTypeNode* op) override; + void VisitType_(const TextureTypeNode* op) override; }; /*! @@ -155,6 +158,7 @@ class TVM_DLL TypeMutator : public TypeFunctor { Type VisitType_(const TypeDataNode* op) override; Type VisitType_(const PrimTypeNode* op) override; Type VisitType_(const PointerTypeNode* op) override; + Type VisitType_(const TextureTypeNode* op) override; private: Array MutateArray(Array arr); diff --git a/src/ir/type.cc b/src/ir/type.cc index fe8e00329bbc..5e0c8911c543 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -67,6 +67,33 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << '*'; }); +TextureType::TextureType(Type element_type) { + ObjectPtr n = make_object(); + n->element_type = std::move(element_type); + data_ = std::move(n); +} +TextureType::TextureType(runtime::DataType dtype) { + ObjectPtr n = make_object(); + n->element_type = PrimType(dtype); + data_ = std::move(n); +} + + +TVM_REGISTER_NODE_TYPE(TextureTypeNode); + +TVM_REGISTER_GLOBAL("ir.TextureType").set_body_typed([](Type element_type) { + return TextureType(element_type); +}); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { + auto* node = static_cast(ref.get()); + p->stream << "texture "; + p->Print(node->element_type); + p->stream << '*'; + }); + + TypeVar::TypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc index 51d5d3778c10..e084a82ed7be 100644 --- a/src/ir/type_functor.cc +++ b/src/ir/type_functor.cc @@ -89,6 +89,8 @@ void TypeVisitor::VisitType_(const PrimTypeNode* op) {} void TypeVisitor::VisitType_(const PointerTypeNode* op) { this->VisitType(op->element_type); } +void TypeVisitor::VisitType_(const TextureTypeNode* op) { this->VisitType(op->element_type); } + Type TypeMutator::VisitType(const Type& t) { return t.defined() ? 
TypeFunctor::VisitType(t) : t; } @@ -198,6 +200,16 @@ Type TypeMutator::VisitType_(const PointerTypeNode* op) { } } +Type TypeMutator::VisitType_(const TextureTypeNode* op) { + Type element_type = VisitType(op->element_type); + + if (element_type.same_as(op->element_type)) { + return GetRef(op); + } else { + return TextureType(element_type); + } +} + // Implements bind. class TypeBinder : public TypeMutator { public: diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 0332a2d539d2..55f68f3e36cb 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -333,6 +333,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; + Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintIRModule(const IRModule& module); diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index f232994480f8..b137ae34107d 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -613,6 +613,12 @@ Doc TIRTextPrinter::VisitType_(const PointerTypeNode* node) { return doc; } +Doc TIRTextPrinter::VisitType_(const TextureTypeNode* node) { + Doc doc; + doc << "Texture(" << Print(node->element_type) << ")"; + return doc; +} + Doc TIRTextPrinter::VisitType_(const TupleTypeNode* node) { std::vector fields; for (Type field : node->fields) { diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index cc7536b48cfd..39852c39b82a 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -145,6 +145,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; + Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintBody(const Stmt& body); @@ -732,6 +733,12 @@ Doc TVMScriptPrinter::VisitType_(const PointerTypeNode* node) { return doc; } +Doc TVMScriptPrinter::VisitType_(const TextureTypeNode* node) { + Doc doc; + doc << "ty.Texture[" << Print(node->element_type) << "]"; + return doc; +} + Doc TVMScriptPrinter::VisitType_(const TupleTypeNode* node) { if (node->fields.empty()) { return Doc::Text("None"); diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index cb7b0f733b1a..20cccb0b9198 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -162,6 +162,21 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type"; } +void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(*) + if (auto* ptr = type.as()) { + return PrintType(ptr->dtype, os); + } else if (auto* ptr = type.as()) { + PrintType(ptr->element_type, os); + os << '*'; + } else if (auto* ptr = type.as()){ + os << "image2d_t"; + } else if (IsVoidType(type)) { + os << "void"; + } else { + LOG(FATAL) << "Type " << type << " does not have a corresponding C Type"; + } +} + void CodeGenOpenCL::PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, std::ostream& os) { // NOLINT(*) if (!HandleTypeMatch(buffer, t.element_of())) { diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 32102fec22b9..f2b6a252f16c 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -45,6 +45,7 @@ 
class CodeGenOpenCL final : public CodeGenC { void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintStorageSync(const CallNode* op) final; // NOLINT(*) void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) + void PrintType(const Type& type, std::ostream& os) final; // NOLINT(*) std::string GetVecLoad(DataType t, const VarNode* buffer, PrimExpr base) final; void PrintVecStore(const VarNode* buffer, DataType t, PrimExpr base, const std::string& value) final; // NOLINT(*) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index d29132450227..d03cf22094a8 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -51,7 +51,7 @@ using namespace tir; runtime::DataType GetRuntimeDataType(const Type& type) { if (auto* n = type.as()) { return n->dtype; - } else if (type.as()) { + } else if (type.as() || type.as()) { return DataType::Handle(); } else if (IsVoidType(type)) { return DataType::Void(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index b97c32b391c9..0b396f2ca56a 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -62,9 +62,9 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); - // Var buffer_var(op->buffer->data->name_hint, vdtype); - // let_binding_.insert({op->buffer->data, buffer_var}); + //Var buffer_var(op->buffer->data->name_hint, DataType::Handle()); + Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); + let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); From 0a321d1d0d8951a25bdc678d8fd55c4622081812 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 13 Nov 2020 13:02:42 -0800 Subject: [PATCH 10/59] Add InferTextureAccess pass to deduce __read_only and __write_only access qualifiers for texture vars. Also refactor use of restrict keyword to be var dependent. 
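The qualifiers are inferred by a small pre-pass over the PrimFunc body: a
texture var seen only in text2d_load calls is printed as __read_only, one
seen only in text2d_store as __write_only, and one seen in both gets no
qualifier. The restrict keyword is now emitted per parameter so it can be
suppressed for image2d_t arguments, where it does not apply. A quick way to
check the result is to print the generated device source; the snippet below
continues the earlier sketch and assumes an OpenCL-enabled build with the
texture lowering wired into the pipeline, which is still in flux at this
point in the series.

    # Illustrative sketch only, continuing the earlier example.
    mod = tvm.build(s, [A, C], target="opencl")
    print(mod.imported_modules[0].get_source())
    # look for "__read_only image2d_t" / "__write_only image2d_t" parameters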
--- src/target/source/codegen_c.cc | 10 +++- src/target/source/codegen_c.h | 2 + src/target/source/codegen_opencl.cc | 72 ++++++++++++++++++++++++++- src/target/source/codegen_opencl.h | 7 ++- src/tir/transforms/texture_flatten.cc | 55 +++----------------- 5 files changed, 93 insertions(+), 53 deletions(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index f676f0f598d8..a311111532c8 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -106,8 +106,8 @@ void CodeGenC::AddFunction(const PrimFunc& f) { } } - if (no_alias && restrict_keyword_.length() != 0) { - stream << ' ' << restrict_keyword_; + if (no_alias) { + PrintRestrict(v, stream); } } else { PrintType(GetType(v), stream); @@ -1018,6 +1018,12 @@ void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, return; } +void CodeGenC::PrintRestrict(const Var& v, std::ostream& os) { + if (restrict_keyword_.length() != 0) { + os << ' ' << restrict_keyword_; + } +} + static bool CheckOutermostBracketMatch(const std::string& s) { if (!s.empty() && s.front() == '(' && s.back() == ')') { size_t len = s.size(); diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index 6ebade7191f2..299f7e0a9cef 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -200,6 +200,8 @@ class CodeGenC : public ExprFunctor, virtual std::string CastFromTo(std::string value, DataType from, DataType target); // Get load of single element with expression virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os); + // Print restrict keyword for a given Var if applicable + virtual void PrintRestrict(const Var& v, std::ostream& os); protected: // Print reference to struct location diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 20cccb0b9198..13e65e20bcc6 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -33,12 +33,61 @@ namespace tvm { namespace codegen { +class InferTextureAccess : public StmtExprVisitor { +public: + static constexpr const uint8_t read_access = 1; + static constexpr const uint8_t write_access = 2; + + explicit InferTextureAccess() {} + std::unordered_map Infer(const Stmt& n) { + this->operator()(n); + std::unordered_map storage_scope_qualifiers; + for (auto& texture : var_access_map_) { + if (texture.second == read_access) { + storage_scope_qualifiers.insert({texture.first, "__read_only "}); + } + else if (texture.second == write_access) { + storage_scope_qualifiers.insert({texture.first, "__write_only "}); + } + else if (texture.second == (read_access | write_access)) { + storage_scope_qualifiers.insert({texture.first, ""}); + } + } + return storage_scope_qualifiers; + } + void VisitExpr_(const CallNode* op) { + if (!op->args.size()) + { + return; + } + if (const VarNode* buffer = op->args[0].as()) + { + if (op->op.same_as(builtin::text2d_load())) { + var_access_map_[buffer] |= read_access; + } + else if (op->op.same_as(builtin::text2d_store())) { + var_access_map_[buffer] |= write_access; + } + } + } +private: + std::unordered_map var_access_map_; +}; + + CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); + this->SetTextureScope(InferTextureAccess().Infer(f->body)); for (Var arg : f->params) { - if (arg.dtype().is_handle()) { + if (arg->type_annotation.as()) + { + // Storage scope qualifiers for 
textures are inferred + // and set prior function codegen. + continue; + } + else if (arg.dtype().is_handle()) { alloc_storage_scope_[arg.get()] = "global"; } } @@ -168,7 +217,7 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* } else if (auto* ptr = type.as()) { PrintType(ptr->element_type, os); os << '*'; - } else if (auto* ptr = type.as()){ + } else if (type.as()){ os << "image2d_t"; } else if (IsVoidType(type)) { os << "void"; @@ -226,6 +275,18 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os } else if (scope == "shared") { os << "__local "; } + else + { + os << scope; + } +} + +void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) { + // Only apply restrict qualifer for non-texture types + if (v->type_annotation.as() == nullptr) + { + os << ' ' << restrict_keyword_; + } } std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) { @@ -323,6 +384,13 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N } } +void CodeGenOpenCL::SetTextureScope(const std::unordered_map& scope) { // NOLINT(*) + for (auto& texture : scope) + { + alloc_storage_scope_.insert(texture); + } +} + runtime::Module BuildOpenCL(IRModule mod, Target target) { using tvm::runtime::Registry; bool output_ssa = false; diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index f2b6a252f16c..3bd71ba9dec8 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -51,8 +51,11 @@ class CodeGenOpenCL final : public CodeGenC { const std::string& value) final; // NOLINT(*) // the address of load/store void PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, - std::ostream& os); // NOLINT(*) - std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + std::ostream& os); // NOLINT(*) + void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) + std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + void SetTextureScope(const std::unordered_map&); // NOLINT(*) + // overload visitor void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0b396f2ca56a..047251e48f00 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -90,58 +90,19 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? 
- //StringImm dtype = StringImm(runtime::DLDataType2String(vdtype)); - StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->data.dtype())); - Array args = {dtype, shape[0], shape[1]}; + StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + + // StringImm func("device_api.opencl.AllocImage2d"); + // Array args = {func, dtype, shape[0], shape[1]}; + // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - stmt = LetStmt(op->buffer->data, Call(op->buffer->data.dtype(), builtin::text2d_alloca(), args), body); - // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating - // stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); + Array args = {dtype, shape[0], shape[1]}; + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } return stmt; } - // Stmt VisitStmt_(const BufferRealizeNode* op) final { - // //DataType vdtype(op->buffer->dtype.code(), op->buffer->dtype.bits(), 4); - // // Var buffer_var(op->buffer->data->name_hint, vdtype); - // // let_binding_.insert({op->buffer->data, buffer_var}); - - // Stmt stmt = StmtExprMutator::VisitStmt_(op); - // op = stmt.as(); - - // std::string storage_scope; - // auto it = storage_scope_.find(op->buffer.get()); - // if (it != storage_scope_.end()) - // { - // storage_scope = it->second; - // } - // else - // { - // storage_scope = op->buffer->scope; - // } - // if (storage_scope == "texture") - // { - // Stmt body = this->VisitStmt(op->body); - // Array shape; - // for (auto r : op->bounds) { - // shape.push_back(r->extent); - // } - // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - - // // TODO(csullivan): Consider check on float only? - // StringImm dtype = StringImm(runtime::DLDataType2String(op->buffer->dtype)); - // Array args = {dtype, shape[0], shape[1]}; - // stmt = Allocate(op->buffer->data, op->buffer->dtype, shape, - // make_const(DataType::Bool(op->buffer->dtype.lanes()), true), body); - // // TODO(csullivan): Adding the below AttrStmt causes SIGSEGV, worth investigating - // //stmt = AttrStmt(op->buffer->data, attr::storage_scope, StringImm(storage_scope), stmt); - // } - - // return stmt; - // } - Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -180,7 +141,7 @@ class TextureFlattener : public StmtExprMutator { } args.push_back(op->value); - stmt = Evaluate(Call(DataType::Void(), builtin::text2d_store(), args)); + stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); From b96daafa8fafa96cf431b556d297b83c41abeecb Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:46:28 -0700 Subject: [PATCH 11/59] Implement texture allocation as external function in TIR lowering. 
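The LetStmt produced by the flattening pass is rewritten by lower_tvm_builtin
into a pair of new backend entry points, mirroring the existing
TVMBackendAllocWorkspace/TVMBackendFreeWorkspace pattern: a null check on the
returned handle, the original body, then the free call. The prototype below
only illustrates the argument layout implied by MakeTextureAlloc (device type
and id, the first two realize extents, and the dtype code/bits hints); the
actual C signature is an assumption, not something this patch defines.

    # Illustrative sketch only: the implied calling convention expressed as a
    # ctypes prototype. Names and the void* return type are assumptions.
    import ctypes

    TVMBackendAllocTextureProto = ctypes.CFUNCTYPE(
        ctypes.c_void_p,   # opaque texture handle
        ctypes.c_int,      # device_type
        ctypes.c_int,      # device_id
        ctypes.c_uint64,   # extent of axis 0
        ctypes.c_uint64,   # extent of axis 1
        ctypes.c_int,      # dtype code hint
        ctypes.c_int,      # dtype bits hint
    )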
--- src/target/llvm/codegen_cpu.cc | 2 ++ src/tir/op/runtime.cc | 10 +++++++ src/tir/transforms/lower_tvm_builtin.cc | 38 +++++++++++++++++++++++++ src/tir/transforms/texture_flatten.cc | 6 ++-- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index ab96d6e69d14..8b01f9d9186e 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -403,6 +403,8 @@ void CodeGenCPU::InitGlobalContext(bool dynamic_lookup) { // Mark as context functions gv_func_map_["TVMBackendAllocWorkspace"] = nullptr; gv_func_map_["TVMBackendFreeWorkspace"] = nullptr; + gv_func_map_["TVMBackendAllocTexture"] = nullptr; + gv_func_map_["TVMBackendFreeTexture"] = nullptr; } } } diff --git a/src/tir/op/runtime.cc b/src/tir/op/runtime.cc index adabae9e75f7..2a894d00ec0c 100644 --- a/src/tir/op/runtime.cc +++ b/src/tir/op/runtime.cc @@ -37,5 +37,15 @@ TVM_REGISTER_OP("tir.TVMBackendFreeWorkspace") .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TVM_REGISTER_OP("tir.TVMBackendAllocTexture") + .set_num_inputs(6) + .set_attr("TGlobalSymbol", "TVMBackendAllocTexture") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TVM_REGISTER_OP("tir.TVMBackendFreeTexture") + .set_num_inputs(3) + .set_attr("TGlobalSymbol", "TVMBackendFreeTexture") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 8b70817398e4..19d434006b83 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -98,6 +98,15 @@ class BuiltinLower : public StmtExprMutator { } } + Stmt VisitStmt_(const LetStmtNode* op) final { + if (const CallNode* call = op->value.as()) { + if (call->op.same_as(builtin::text2d_alloca())) { + return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); + } + } + return StmtExprMutator::VisitStmt_(op); + } + Stmt VisitStmt_(const AllocateNode* op) { // Lower allocate to device allocate when needed. 
Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -184,6 +193,7 @@ class BuiltinLower : public StmtExprMutator { return StmtExprMutator::VisitExpr_(op); } } + // call shape PrimExpr MakeShape(const CallNode* op) { // if args.size() == 0, it represents a scalar shape () @@ -341,6 +351,34 @@ class BuiltinLower : public StmtExprMutator { return Call(op->dtype, builtin::tvm_call_trace_packed_lowered(), packed_args); } + Stmt MakeTextureAlloc(const LetStmtNode* let, const CallNode* call) { + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; + Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); + + Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), + throw_last_error), + let->body}); + DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + Stmt alloca = LetStmt( + let->var, + Call(let->var.dtype(), Op::Get("tir.TVMBackendAllocTexture"), + {cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), + cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}), + body); + + PrimExpr free_op = Call(DataType::Int(32), Op::Get("tir.TVMBackendFreeTexture"), + {cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); + Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); + body = SeqStmt({alloca, free_stmt}); + return body; + } + private: bool IsArrayHandle(const PrimExpr& arg) { // specially set array handle. diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 047251e48f00..d8063105483d 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -90,13 +90,13 @@ class TextureFlattener : public StmtExprMutator { ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; // TODO(csullivan): Consider check on float only? - StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + // StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); // StringImm func("device_api.opencl.AllocImage2d"); // Array args = {func, dtype, shape[0], shape[1]}; // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - Array args = {dtype, shape[0], shape[1]}; + Array args = {shape[0], shape[1]}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -141,7 +141,7 @@ class TextureFlattener : public StmtExprMutator { } args.push_back(op->value); - stmt = Evaluate(Call(op->buffer->dtype, builtin::text2d_store(), args)); + stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); if (needs_vectorization_) { loop_vars_.insert({op->indices.back().get(), true}); From 83e9af34922942175c8816c30ad85fa430ac0d68 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 18 Nov 2020 10:59:43 -0800 Subject: [PATCH 12/59] Remove commented lines. 
--- src/target/source/codegen_opencl.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 13e65e20bcc6..18afc96301b9 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -331,13 +331,6 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { - /* - float4 read_imagef(read_only image2d_t image, - sampler_t sampler, - int2 coord) - */ - // std::cout << "LOAD\n"; - // std::cout << op->args << std::endl; os << "read_imagef("; this->PrintExpr(op->args[0], os); os << ", "; From fd0d23aca93fb90a209346442b534ae3094d86b0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 19 Nov 2020 16:36:54 -0800 Subject: [PATCH 13/59] Add nd->2d texture flattening. --- src/tir/ir/buffer.cc | 2 + src/tir/transforms/texture_flatten.cc | 285 +++++++++++++++++++++++--- 2 files changed, 254 insertions(+), 33 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 335ff19dd775..90560e0dcac7 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -54,6 +54,7 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } +namespace { // Split the given expression w.r.t the add operator inline std::vector ExprSplitAddition(const PrimExpr& expr) { using namespace tir; @@ -290,6 +291,7 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } +} PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d8063105483d..c69c9d68d0b2 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include +#include #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" @@ -45,10 +47,212 @@ namespace tvm { namespace tir { +namespace { + using IndexMod = tir::FloorModNode; + using IndexDiv = tir::FloorDivNode; + +// Split the given expression w.r.t the add operator +inline std::vector ExprSplitAddition(const PrimExpr& expr) { + using namespace tir; + std::vector ret; + std::stack split_buffer; + split_buffer.push(&expr); + while (!split_buffer.empty()) { + const PrimExpr* top_ele = split_buffer.top(); + split_buffer.pop(); + auto expr_add_match = top_ele->as(); + if (expr_add_match) { + split_buffer.push(&expr_add_match->b); + split_buffer.push(&expr_add_match->a); + } else { + ret.emplace_back(top_ele); + } + } + return ret; +} -using runtime::StorageRank; -using runtime::StorageScope; -using runtime::ThreadScope; +// Searches for the following types of expr: +// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// mod_l_expr = c +// mod_r_expr = k1 * k2 * ... * ki +// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) +// Currently the we will not search the add/mult combinations exhaustively +// as it will take too much computation. 
+inline std::pair MergeMulModInner(const PrimExpr& mult_expr, + const PrimExpr& mod_l_expr, + const PrimExpr& mod_r_expr) { + using namespace tir; + const MulNode* mult_ptr = mult_expr.as(); + if (!mult_ptr) return std::make_pair(false, PrimExpr()); + PrimExpr mult_outer = mult_ptr->b; + const PrimExpr* inner = &(mult_ptr->a); + // 1. Calculate the outer multiplier + while (true) { + mult_ptr = inner->as(); + if (mult_ptr) { + inner = &(mult_ptr->a); + mult_outer = mult_ptr->b * mult_outer; + } else { + break; + } + } + // 2. Search for the pattern c / (...) * (...) + c % (...) + // We match the search element with Add, Mul and Div. + // If Add is found, we need to continue our search for the rhs + // If Mult is found, we will expand the inner multiplication factor + // If Div is found, we will go on testing whether lhs matches the lhs of mod expr + // and returns the optimization result. + const PrimExpr* search_ptr = inner; + PrimExpr mult_inner; // The inner multiplication factor + PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized + tir::ExprDeepEqual expr_equal; + + while (true) { + auto inner_div_ptr = search_ptr->as(); + auto inner_mult_ptr = search_ptr->as(); + auto inner_add_ptr = search_ptr->as(); + if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { + return std::make_pair(false, PrimExpr()); + } else if (inner_div_ptr) { + PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; + if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && + expr_equal(inner_div_ptr->a, mod_l_expr)) { + // Found! + PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; + return std::make_pair(true, ret); + } else { + return std::make_pair(false, PrimExpr()); + } + } else if (inner_mult_ptr) { + mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; + search_ptr = &(inner_mult_ptr->a); + } else if (inner_add_ptr) { + if (mult_inner.get()) { + return std::make_pair(false, PrimExpr()); + } + no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; + search_ptr = &(inner_add_ptr->b); + } else { + LOG(FATAL) << "Unexpected search result!"; + break; + } + } + return std::make_pair(false, PrimExpr()); +} + +// Insert the elements into the corresponding mult_exprs and mod_exprs. +// If the element is found to match Mul, it will be pushed to the mult_exprs. +// If the element it found to match Mod, it will be pused to the mod_exprs. +// Otherwise, the elements will be added to the no_opt_sum variable +inline void MergeMulModInsertElements(const std::vector& eles, + std::list* mult_exprs, + std::list >* mod_exprs, + PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { + using namespace tir; + *has_mult = false; + *has_mod = false; + for (const PrimExpr* ele : eles) { + auto mod_ptr = ele->as(); + auto mult_ptr = ele->as(); + if (mod_ptr) { + *has_mod = true; + mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); + } else if (mult_ptr) { + *has_mult = true; + mult_exprs->emplace_back(*ele); + } else { + *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; + } + } +} + +// Searches for this types of expr: +// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// + c % (k1 * k2 * ... * ki) +// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c +// The search will be performed repeatively until no pattern is found. 
+// Return: a pair with (false, Expr()) if cannot be optimized. +// a pair with (true, optimized_expr) if can be optimized +inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { + using namespace tir; + // 1. Prepare the lists. + // We store two lists, a list that contain all the elements that match Mul and + // a list that contain all the elements that match Mod. + // The elements in the Mod will be used to match against the elements in Mul. + // The result will then be split and pushed back to these two lists. + PrimExpr simplified_base = analyzer->Simplify(base); + std::vector eles = ExprSplitAddition(simplified_base); + std::list mult_exprs; + std::list > mod_exprs; + PrimExpr no_opt_sum; + bool has_mult; + bool has_mod; + MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); + bool find_opt = false; + std::list >::iterator search_mod_it = mod_exprs.begin(); + // 2. Exhaustive Search + while (search_mod_it != mod_exprs.end()) { + std::list::iterator mult_it = mult_exprs.begin(); + bool inner_find_opt = false; + while (mult_it != mult_exprs.end()) { + std::pair ret = + MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); + if (ret.first) { + inner_find_opt = true; + auto temp_mod_it = search_mod_it; + ++search_mod_it; + mod_exprs.erase(temp_mod_it); + mult_exprs.erase(mult_it); + std::vector ret_eles = ExprSplitAddition(ret.second); + MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, + &has_mod); + if (has_mult) { + search_mod_it = mod_exprs.begin(); + } else if (has_mod && search_mod_it == mod_exprs.end()) { + search_mod_it--; + } + break; + } else { + ++mult_it; + } + } + find_opt = find_opt || inner_find_opt; + if (!inner_find_opt) { + ++search_mod_it; + } + } + if (!find_opt) { + return simplified_base; + } + for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; + } + for (std::list >::iterator it = mod_exprs.begin(); + it != mod_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? 
no_opt_sum + indexmod(it->first, it->second) + : indexmod(it->first, it->second); + } + return no_opt_sum; +} + +inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { + PrimExpr base = make_const(DataType::Int(32), 0); //IntImm(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + arith::Analyzer ana; + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = MergeMulMod(&ana, offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; +} + +size_t GetAxisSeparator() { + return 1; +} +} class TextureFlattener : public StmtExprMutator { public: @@ -62,7 +266,6 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - //Var buffer_var(op->buffer->data->name_hint, DataType::Handle()); Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); let_binding_.insert({op->buffer->data, buffer_var}); @@ -82,21 +285,25 @@ class TextureFlattener : public StmtExprMutator { if (storage_scope == "texture") { Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + ICHECK_EQ(static_cast(op->bounds.back()->extent.as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + Array shape; - for (auto r : op->bounds) { - shape.push_back(r->extent); + auto width = IntImm(DataType::Int(32), 1); + auto height = IntImm(DataType::Int(32), 1); + //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) + for (size_t i = 0; i < op->bounds.size()-1; i++) { + if (i < GetAxisSeparator()) { + width *= op->bounds[i]->extent; + } else { + height *= op->bounds[i]->extent; + } } - ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // TODO(csullivan): Consider check on float only? - // StringImm dtype(runtime::DLDataType2String(buffer_var.dtype())); + // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; + // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - // StringImm func("device_api.opencl.AllocImage2d"); - // Array args = {func, dtype, shape[0], shape[1]}; - // stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::tvm_call_packed(), args), body); - - Array args = {shape[0], shape[1]}; + Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -128,17 +335,24 @@ class TextureFlattener : public StmtExprMutator { { args.push_back(op->buffer->data); } - // for (auto& i : op->indices) - // { - // args.push_back(i); - // } - - // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. - // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. 
- for (size_t i = 0u; i < op->indices.size()-1; i++) + + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - args.push_back(op->indices[i]); + if (i < GetAxisSeparator()) { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } else { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } } + + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + + args.push_back(row_offset); + args.push_back(col_offset); args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); @@ -178,18 +392,23 @@ class TextureFlattener : public StmtExprMutator { } - // for (auto& i : op->indices) - // { - // args.push_back(i); - // } - - // TODO(csullivan)-BeforePR: Consider whether always dropping the last index is correct. - // I don't think this will work generally when tensor dimension doesn't have (4) in the FCD. - for (size_t i = 0u; i < op->indices.size()-1; i++) + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - args.push_back(op->indices[i]); + if (i < GetAxisSeparator()) { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } else { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + args.push_back(row_offset); + args.push_back(col_offset); + expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { From 56451127617e89b8248bafee8832f0e1fd91e5ec Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 2 Dec 2020 17:00:13 -0800 Subject: [PATCH 14/59] Bug fixes in opencl codegen (row<>col, access quals.) 
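
In OpenCL the int2 coordinate passed to read_image*/write_image* is
ordered (x, y), where x indexes the image width (columns) and y the
image height (rows), so the column offset must be printed before the
row offset. The access qualifiers now come from the inferred
texture_read/texture_write storage scopes instead of being emitted
inline. A minimal hand-written kernel (illustrative only, not
compiler output; names are placeholders) showing the intended
qualifiers and coordinate order:

    __kernel void copy_row(__read_only image2d_t src,
                           __write_only image2d_t dst,
                           int row) {
      const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
      int col = (int)get_global_id(0);
      // coordinate is (column, row), i.e. (x, y)
      float4 v = read_imagef(src, smp, (int2)(col, row));
      write_imagef(dst, (int2)(col, row), v);
    }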
--- src/target/source/codegen_opencl.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 18afc96301b9..8a82e9e78b0d 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -44,10 +44,10 @@ class InferTextureAccess : public StmtExprVisitor { std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { if (texture.second == read_access) { - storage_scope_qualifiers.insert({texture.first, "__read_only "}); + storage_scope_qualifiers.insert({texture.first, "texture_read"}); } else if (texture.second == write_access) { - storage_scope_qualifiers.insert({texture.first, "__write_only "}); + storage_scope_qualifiers.insert({texture.first, "texture_write"}); } else if (texture.second == (read_access | write_access)) { storage_scope_qualifiers.insert({texture.first, ""}); @@ -274,10 +274,10 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os os << "__global "; } else if (scope == "shared") { os << "__local "; - } - else - { - os << scope; + } else if (scope == "texture_read") { + os << "__read_only "; + } else if (scope == "texture_write") { + os << "__write_only "; } } @@ -324,9 +324,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[0], os); os << ", "; os << "(int2)("; - this->PrintExpr(op->args[2], os); - os << ", "; this->PrintExpr(op->args[1], os); + os << ", "; + this->PrintExpr(op->args[2], os); os << "), "; this->PrintExpr(op->args[3], os); os << ")"; @@ -336,9 +336,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << ", "; os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; os << "(int2)("; - this->PrintExpr(op->args[2], os); - os << ", "; this->PrintExpr(op->args[1], os); + os << ", "; + this->PrintExpr(op->args[2], os); os << "))"; } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); From 34549c7e248efe409d1f7b0a6ed92549a8d37ba0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 9 Dec 2020 15:29:23 -0800 Subject: [PATCH 15/59] Improve texture codegen by explicitly allocating local vector for the texture load. Also support indexing individual elements of the RGBA vector. --- src/target/source/codegen_opencl.cc | 44 +++++++++++++++++++++------ src/target/source/codegen_opencl.h | 3 ++ src/tir/transforms/texture_flatten.cc | 2 +- src/tir/transforms/vectorize_loop.cc | 8 ++++- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8a82e9e78b0d..010f30890217 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -305,6 +305,12 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType return os.str(); } +void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { + stored_value_ = op->value; + CodeGenC::VisitStmt_(op); + stored_value_ = PrimExpr(nullptr); +} + void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { if (op->op.same_as(builtin::address_of())) { // Overload tvm_address_of to add storage scope (e.g. __global). 
@@ -331,15 +337,35 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { - os << "read_imagef("; - this->PrintExpr(op->args[0], os); - os << ", "; - os << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; - os << "(int2)("; - this->PrintExpr(op->args[1], os); - os << ", "; - this->PrintExpr(op->args[2], os); - os << "))"; + std::stringstream ss; + ss << "read_imagef("; + this->PrintExpr(op->args[0], ss); + ss << ", "; + ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; + ss << "(int2)("; + this->PrintExpr(op->args[1], ss); + ss << ", "; + this->PrintExpr(op->args[2], ss); + ss << "))"; + + // Only use local SSA if texture is not already being stored + auto value = GetRef(stored_value_.as()); + if (value.same_as(GetRef(op))) + { + os << ss.str(); + } else { + std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); + if (op->args.back().as()) + { + os << rhs; + } else { + os << "(("; + this->PrintType(op->dtype.with_lanes(1), os); + os << "*)&" << rhs << ")["; + this->PrintExpr(op->args.back(), os); + os << "]"; + } + } } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); // Enable atomics extension if used. diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 3bd71ba9dec8..374ae4ae56b6 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -61,6 +61,8 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) + void VisitStmt_(const StoreNode* op) final; // NOLINT(*) + private: // whether enable fp16 and fp64 extension @@ -68,6 +70,7 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. 
bool enable_atomics_{false}; + PrimExpr stored_value_{nullptr}; }; } // namespace codegen diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c69c9d68d0b2..953e6664940d 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -408,7 +408,7 @@ class TextureFlattener : public StmtExprMutator { PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); args.push_back(row_offset); args.push_back(col_offset); - + args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); if (needs_vectorization_) { diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 3f33667ea2da..9943d1e37938 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -268,11 +268,17 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::text2d_load())) { - return Call(op->dtype.with_lanes(4), op->op, op->args); + int lane = 0; + Array fcd = MutateArray({op->args.back()}, &lane); + auto new_args = op->args; + new_args.pop_back(); + new_args.push_back(fcd[0]); + return Call(op->dtype.with_lanes(4), op->op, new_args); } else if (op->op.same_as(builtin::text2d_store())) { int lane = 0; + // Vectorize the value to store Array value{op->args.back()}; Array mutated_value = MutateArray(value, &lane); Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; From 623f2eb45a2bb1ed9bac1aa3755a790ec4202b6f Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 8 Dec 2020 14:30:11 -0800 Subject: [PATCH 16/59] Remove automatic vectorization code as it is no longer needed. --- src/tir/transforms/texture_flatten.cc | 39 +-------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 953e6664940d..28bbf3e61cb5 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -256,7 +256,7 @@ size_t GetAxisSeparator() { class TextureFlattener : public StmtExprMutator { public: - explicit TextureFlattener() : needs_vectorization_(true) {} + explicit TextureFlattener() {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::realize_scope) { @@ -356,10 +356,6 @@ class TextureFlattener : public StmtExprMutator { args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); - if (needs_vectorization_) - { - loop_vars_.insert({op->indices.back().get(), true}); - } } return stmt; @@ -410,49 +406,16 @@ class TextureFlattener : public StmtExprMutator { args.push_back(col_offset); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); - if (needs_vectorization_) - { - loop_vars_.insert({op->indices.back().get(), true}); - } } return expr; } - // Auto-vectorize texture load and store loops - Stmt VisitStmt_(const ForNode* op) final { - Stmt stmt; - if (!needs_vectorization_) - { - stmt = StmtMutator::VisitStmt_(op); - } - else if (op->for_type == ForType::Serial) - { - stmt = StmtMutator::VisitStmt_(op); - auto it = loop_vars_.find(op->loop_var.get()); - if (it != loop_vars_.end() && it->second) - { - stmt = For(op->loop_var, op->min, op->extent, ForType::Vectorized, op->device_api, op->body); - stmt = StmtMutator::VisitStmt_(stmt.as()); - } - } - else - { - needs_vectorization_ = false; - stmt = StmtMutator::VisitStmt_(op); - needs_vectorization_ = true; - } - - return stmt; 
- } - private: // Storage scope std::unordered_map storage_scope_; // Let binding std::unordered_map let_binding_; - std::unordered_map loop_vars_; - bool needs_vectorization_; }; PrimFunc TextureFlatten(PrimFunc func) { From 5f9ebd1ae78daeb48a17c25b5d6fa48ad6b57a70 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 15 Dec 2020 15:10:22 -0800 Subject: [PATCH 17/59] Improve SSA local use when storing texture read to scalar buffer. --- src/target/source/codegen_opencl.cc | 28 ++++++++++++++++++++++------ src/target/source/codegen_opencl.h | 4 +++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 010f30890217..6367bd1b40d8 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -306,9 +306,26 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType } void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { - stored_value_ = op->value; + if (auto call = op->value.as()) { + if (call->op.same_as(builtin::text2d_load())) { + need_texture_ssa_ = false; + // If storing a texture load into a buffer, don't use an + // intermediate local unless the buffer allocation is a + // single element selected from the texture read. + auto it = allocation_size_.find(op->buffer_var.get()); + if (it != allocation_size_.end() && it->second == 1) + { + need_texture_ssa_ = true; + } + } + } + CodeGenC::VisitStmt_(op); + need_texture_ssa_ = true; +} + +void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { + allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); - stored_value_ = PrimExpr(nullptr); } void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { @@ -349,11 +366,8 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ss << "))"; // Only use local SSA if texture is not already being stored - auto value = GetRef(stored_value_.as()); - if (value.same_as(GetRef(op))) + if (need_texture_ssa_) { - os << ss.str(); - } else { std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); if (op->args.back().as()) { @@ -365,6 +379,8 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args.back(), os); os << "]"; } + } else { + os << ss.str(); } } else if (op->op.same_as(builtin_call_extern_)) { auto func = Downcast(op->args[0]); diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 374ae4ae56b6..399dc6c4c007 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -62,6 +62,7 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* op) final; // NOLINT(*) + void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) private: @@ -70,7 +71,8 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. 
bool enable_atomics_{false}; - PrimExpr stored_value_{nullptr}; + bool need_texture_ssa_{true}; + std::unordered_map allocation_size_; }; } // namespace codegen From d8fbcfdb56c806882866ddd3291c51da26ccfb40 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 10 Dec 2020 11:19:30 -0800 Subject: [PATCH 18/59] Define texture flattening convention such that the outer Nd-1 axes are stored as rows, and the last axis is stored as columns. --- src/tir/transforms/texture_flatten.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 28bbf3e61cb5..0ca908826dac 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -249,8 +249,16 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] + // + + return shape_rank - 2; } } @@ -293,7 +301,7 @@ class TextureFlattener : public StmtExprMutator { auto height = IntImm(DataType::Int(32), 1); //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->bounds.size())) { width *= op->bounds[i]->extent; } else { height *= op->bounds[i]->extent; @@ -339,7 +347,7 @@ class TextureFlattener : public StmtExprMutator { Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { row_dims.push_back(op->buffer->shape[i]); row_indices.push_back(op->indices[i]); } else { @@ -391,7 +399,7 @@ class TextureFlattener : public StmtExprMutator { Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator()) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { row_dims.push_back(op->buffer->shape[i]); row_indices.push_back(op->indices[i]); } else { From b81620fe5decc10cdeac266751a1c1bcb60de36b Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:49:22 -0700 Subject: [PATCH 19/59] Add tir lowering and opencl codegen support for float16 textures. 
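
read_imageh/write_imageh are selected when the underlying buffer dtype
is float16, matching textures allocated with a half-float channel
type; read_imagef/write_imagef remain the default for float32. A
minimal hand-written kernel (illustrative only, not compiler output)
showing the half-precision variants; it assumes the device supports
cl_khr_fp16:

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    __kernel void copy_fp16(__read_only image2d_t src,
                            __write_only image2d_t dst) {
      const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
      int x = (int)get_global_id(0);
      int y = (int)get_global_id(1);
      // half-precision image access uses the *h variants
      half4 v = read_imageh(src, smp, (int2)(x, y));
      write_imageh(dst, (int2)(x, y), v);
    }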
--- src/target/source/codegen_opencl.cc | 17 +++++++++++++++-- src/tir/transforms/texture_flatten.cc | 10 +++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 6367bd1b40d8..d0efbdeaec88 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -343,7 +343,15 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(load->index, os); os << ')'; } else if (op->op.same_as(builtin::text2d_store())) { - os << "write_imagef("; + auto* texture_type = op->args[0].as()->type_annotation.as(); + ICHECK(texture_type != nullptr) << "builtin::text2d_store() only supports storing to texture buffers"; + DataType buffer_type = texture_type->element_type.as()->dtype; + if (buffer_type.is_float16()) { + os << "write_imageh("; + } + else if (buffer_type.is_float()) { + os << "write_imagef("; + } this->PrintExpr(op->args[0], os); os << ", "; os << "(int2)("; @@ -355,7 +363,12 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << ")"; } else if (op->op.same_as(builtin::text2d_load())) { std::stringstream ss; - ss << "read_imagef("; + if (op->dtype.is_float16()) { + ss << "read_imageh("; + } + else if (op->dtype.is_float()) { + ss << "read_imagef("; + } this->PrintExpr(op->args[0], ss); ss << ", "; ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0ca908826dac..c144e8ac5742 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -274,7 +274,7 @@ class TextureFlattener : public StmtExprMutator { } Stmt VisitStmt_(const BufferRealizeNode* op) final { - Var buffer_var(op->buffer->data->name_hint, TextureType(DataType::Float(32, 1))); + Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -299,7 +299,8 @@ class TextureFlattener : public StmtExprMutator { Array shape; auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); - //TODO(csulivan): this does not handle the case where the last dimension isn't previously set to a vector(4) + // TODO(csulivan): We do not currently handle the case where + // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { width *= op->bounds[i]->extent; @@ -308,9 +309,6 @@ class TextureFlattener : public StmtExprMutator { } } - // ICHECK_EQ(shape.size(), 3) << "Only 2d RGBA texture is currently supported"; - // ICHECK_EQ(static_cast(shape[2].as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; - Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -427,10 +425,8 @@ class TextureFlattener : public StmtExprMutator { }; PrimFunc TextureFlatten(PrimFunc func) { - // std::cout << "Before TextureFlattening: " << func << std::endl; auto fptr = func.CopyOnWrite(); fptr->body = TextureFlattener()(std::move(fptr->body)); - // std::cout << "After TextureFlattening: " << func << std::endl; return func; } From 557e07dfd863b6257180be437a19cd67d6386765 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 4 Feb 2021 15:31:15 -0800 Subject: [PATCH 20/59] Disable SSA when texture load is immediately 
casted. --- src/target/source/codegen_opencl.cc | 12 +++++++++++- src/target/source/codegen_opencl.h | 5 +++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index d0efbdeaec88..b8ff1d451445 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -315,7 +315,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { auto it = allocation_size_.find(op->buffer_var.get()); if (it != allocation_size_.end() && it->second == 1) { - need_texture_ssa_ = true; + need_texture_ssa_ = true; } } } @@ -323,6 +323,16 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { need_texture_ssa_ = true; } +void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { + if (auto call = op->value.as()) { + if (call->op.same_as(builtin::text2d_load())) { + need_texture_ssa_ = false; + } + } + CodeGenC::VisitExpr_(op, os); + need_texture_ssa_ = true; +} + void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 399dc6c4c007..a456fdd94f5f 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -58,11 +58,12 @@ class CodeGenOpenCL final : public CodeGenC { // overload visitor - void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) + void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) void VisitExpr_(const BroadcastNode* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const CallNode* op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const CastNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* op) final; // NOLINT(*) - void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) private: From 19469c61f47372b9f933a932ef9364528dc04982 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 1 Mar 2021 15:08:51 -0800 Subject: [PATCH 21/59] Allow RGBA extent to be of length 1. --- src/tir/transforms/texture_flatten.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c144e8ac5742..5de637fd149e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -294,7 +294,8 @@ class TextureFlattener : public StmtExprMutator { { Stmt body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; - ICHECK_EQ(static_cast(op->bounds.back()->extent.as()->value), 4) << "FCD of texture must be vector of length 4 (RGBA)"; + int vec_length = static_cast(op->bounds.back()->extent.as()->value); + ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; Array shape; auto width = IntImm(DataType::Int(32), 1); From 8b3b3ded0ec327d9fa14aff23c8020b77559efa7 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:54:15 -0700 Subject: [PATCH 22/59] Add pass to forward externally allocated textures in place of textures realized from cache_read. Fix to better follow indexing spec. 
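
The forwarding applies when a buffer realized in "texture" scope is
written only as an element-for-element copy of an externally provided
texture argument: if both buffers are in texture scope and the
flattened store and load offsets can be proven equal, downstream loads
are rebound to the external buffer so the intermediate copy introduced
by cache_read can be elided instead of generating an extra
image-to-image copy on device.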
--- src/tir/transforms/texture_flatten.cc | 211 +++++++++++++++++--------- 1 file changed, 141 insertions(+), 70 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 5de637fd149e..f73208e5759e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -18,10 +18,9 @@ */ /*! - * \file storage_flatten.cc - * \brief Flattens storage from multi-dimensional array to 1D buffer access + * \file texture_flatten.cc + * \brief Flattens texture from multi-dimensional array to 2D buffer access */ -// The pass definition originates from Halide pipeline. #include #include @@ -262,37 +261,70 @@ size_t GetAxisSeparator(size_t shape_rank) { } } -class TextureFlattener : public StmtExprMutator { +class TextureLoweringBase : public StmtExprMutator { public: - explicit TextureFlattener() {} + explicit TextureLoweringBase(const Map& extern_buffer_map) { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } - Stmt VisitStmt_(const AttrStmtNode* op) final { + virtual Stmt VisitStmt_(const AttrStmtNode* op) { if (op->attr_key == attr::realize_scope) { - storage_scope_[op->node.get()] = op->value.as()->value; + std::string realize_scope = op->value.as()->value; + // If realize_scope for external buffer is unset, infer from buffer scope + if (realize_scope == "" && op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + if (extern_buf_.count(realize->buffer)) { + realize_scope = realize->buffer->scope; + } + } + storage_scope_[op->node.get()] = realize_scope; } return StmtExprMutator::VisitStmt_(op); } + protected: + + std::string GetStorageScope(const Buffer& buffer) { + std::string storage_scope; + auto it = storage_scope_.find(buffer.get()); + // If buffer has a realize_scope attr return it + if (it != storage_scope_.end()) { + storage_scope = it->second; + } else { + storage_scope = buffer->scope; + } + return storage_scope; + } + + // Buffer set + std::unordered_set extern_buf_; + // Storage scope + std::unordered_map storage_scope_; +}; + +class TextureFlattener : public TextureLoweringBase { + public: + explicit TextureFlattener(const Map& extern_buffer_map, + const std::unordered_map& extern_buffer_binds_) + : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); + Stmt body = this->VisitStmt(op->body); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; - } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { - Stmt body = this->VisitStmt(op->body); + std::string storage_scope = GetStorageScope(op->buffer); + if (storage_scope == "texture") { + body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; @@ -304,9 +336,9 @@ class TextureFlattener : public StmtExprMutator { // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < 
op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { - width *= op->bounds[i]->extent; - } else { height *= op->bounds[i]->extent; + } else { + width *= op->bounds[i]->extent; } } @@ -321,37 +353,23 @@ class TextureFlattener : public StmtExprMutator { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; - } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { + std::string storage_scope = GetStorageScope(op->buffer); + if (storage_scope == "texture") { Array args; - if (let_binding_.count(op->buffer->data)) - { + if (let_binding_.count(op->buffer->data)) { args.push_back(let_binding_[op->buffer->data]); - } - else - { + } else { args.push_back(op->buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) - { + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { if (i < GetAxisSeparator(op->buffer->shape.size())) { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } else { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); } } @@ -372,38 +390,29 @@ class TextureFlattener : public StmtExprMutator { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - std::string storage_scope; - auto it = storage_scope_.find(op->buffer.get()); - if (it != storage_scope_.end()) - { - storage_scope = it->second; + auto buffer = op->buffer; + if (buffer_binds_.count(op->buffer)) { + buffer = buffer_binds_[op->buffer]; } - else - { - storage_scope = op->buffer->scope; - } - if (storage_scope == "texture") - { + + std::string storage_scope = GetStorageScope(buffer); + if (storage_scope == "texture") { Array args; - if (let_binding_.count(op->buffer->data)) - { + if (let_binding_.count(op->buffer->data)) { args.push_back(let_binding_[op->buffer->data]); - } - else - { - args.push_back(op->buffer->data); + } else { + args.push_back(buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) - { + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { if (i < GetAxisSeparator(op->buffer->shape.size())) { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } else { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); } } @@ -418,16 +427,78 @@ class TextureFlattener : public StmtExprMutator { return expr; } - private: - // Storage scope - std::unordered_map storage_scope_; + protected: + // Let binding std::unordered_map let_binding_; + std::unordered_map buffer_binds_; }; + +class ExternalBufferForwarding : public TextureLoweringBase { + public: + explicit ExternalBufferForwarding(const Map& extern_buffer_map) + : TextureLoweringBase(extern_buffer_map) {;} + + Stmt VisitStmt_(const AttrStmtNode* op) final { + Stmt stmt = TextureLoweringBase::VisitStmt_(op); + if (op->attr_key == attr::realize_scope) { + if (op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + std::string realize_scope = GetStorageScope(realize->buffer); + if (realize_scope == "texture" && extern_buffer_copy_.count(realize->buffer)) { + return 
realize_attrs_.back(); + } else { + if (realize_attrs_.size()) { + realize_attrs_.pop_back(); + } + realize_attrs_.push_back(stmt); + } + return stmt; + } + } + + return stmt; + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + if (auto load = op->value.as()) { + if (extern_buf_.count(load->buffer)) { + // If the buffer to load and the buffer to store to are both texture + // check for identical access + if (GetStorageScope(load->buffer) == "texture" && GetStorageScope(op->buffer) == "texture") { + auto store_index = SimplifyOffset(op->buffer->shape, op->indices); + auto load_index = SimplifyOffset(load->buffer->shape, load->indices); + if (arith::Analyzer().CanProve(store_index == load_index)) { + extern_buffer_copy_.insert(op->buffer); + buffer_map_.insert({op->buffer, load->buffer}); + } + } + } + } + + return stmt; + } + + const std::unordered_map& GetForwardedBuffers() { + return buffer_map_; + } + + private: + std::deque realize_attrs_; + std::unordered_set extern_buffer_copy_; + std::unordered_map buffer_map_; +}; + + PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - fptr->body = TextureFlattener()(std::move(fptr->body)); + ExternalBufferForwarding forward(fptr->buffer_map); + fptr->body = forward(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers())(std::move(fptr->body)); return func; } From d7d3195cf3c1290ee0a12119efe9082106ba1c5f Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 15:44:11 -0700 Subject: [PATCH 23/59] Add buffer_common.h to house buffer offset simplification routines. --- src/tir/ir/buffer.cc | 245 +------------------------- src/tir/transforms/texture_flatten.cc | 190 +------------------- 2 files changed, 3 insertions(+), 432 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 90560e0dcac7..beee377d8401 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -29,17 +29,13 @@ #include #include -#include -#include +#include "buffer_common.h" #include "../../arith/pattern_match.h" namespace tvm { namespace tir { -using IndexMod = tir::FloorModNode; -using IndexDiv = tir::FloorDivNode; - Array SimplifyArray(arith::Analyzer* ana, Array array) { for (size_t i = 0; i < array.size(); ++i) { array.Set(i, ana->Simplify(array[i])); @@ -54,245 +50,6 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } -namespace { -// Split the given expression w.r.t the add operator -inline std::vector ExprSplitAddition(const PrimExpr& expr) { - using namespace tir; - std::vector ret; - std::stack split_buffer; - split_buffer.push(&expr); - while (!split_buffer.empty()) { - const PrimExpr* top_ele = split_buffer.top(); - split_buffer.pop(); - auto expr_add_match = top_ele->as(); - if (expr_add_match) { - split_buffer.push(&expr_add_match->b); - split_buffer.push(&expr_add_match->a); - } else { - ret.emplace_back(top_ele); - } - } - return ret; -} - -// Searches for the following types of expr: -// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// mod_l_expr = c -// mod_r_expr = k1 * k2 * ... * ki -// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) -// Currently the we will not search the add/mult combinations exhaustively -// as it will take too much computation. 
-inline std::pair MergeMulModInner(const PrimExpr& mult_expr, - const PrimExpr& mod_l_expr, - const PrimExpr& mod_r_expr) { - using namespace tir; - const MulNode* mult_ptr = mult_expr.as(); - if (!mult_ptr) return std::make_pair(false, PrimExpr()); - PrimExpr mult_outer = mult_ptr->b; - const PrimExpr* inner = &(mult_ptr->a); - // 1. Calculate the outer multiplier - while (true) { - mult_ptr = inner->as(); - if (mult_ptr) { - inner = &(mult_ptr->a); - mult_outer = mult_ptr->b * mult_outer; - } else { - break; - } - } - // 2. Search for the pattern c / (...) * (...) + c % (...) - // We match the search element with Add, Mul and Div. - // If Add is found, we need to continue our search for the rhs - // If Mult is found, we will expand the inner multiplication factor - // If Div is found, we will go on testing whether lhs matches the lhs of mod expr - // and returns the optimization result. - const PrimExpr* search_ptr = inner; - PrimExpr mult_inner; // The inner multiplication factor - PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized - tir::ExprDeepEqual expr_equal; - - while (true) { - auto inner_div_ptr = search_ptr->as(); - auto inner_mult_ptr = search_ptr->as(); - auto inner_add_ptr = search_ptr->as(); - if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { - return std::make_pair(false, PrimExpr()); - } else if (inner_div_ptr) { - PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; - if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && - expr_equal(inner_div_ptr->a, mod_l_expr)) { - // Found! - PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; - return std::make_pair(true, ret); - } else { - return std::make_pair(false, PrimExpr()); - } - } else if (inner_mult_ptr) { - mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; - search_ptr = &(inner_mult_ptr->a); - } else if (inner_add_ptr) { - if (mult_inner.get()) { - return std::make_pair(false, PrimExpr()); - } - no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; - search_ptr = &(inner_add_ptr->b); - } else { - LOG(FATAL) << "Unexpected search result!"; - break; - } - } - return std::make_pair(false, PrimExpr()); -} - -// Insert the elements into the corresponding mult_exprs and mod_exprs. -// If the element is found to match Mul, it will be pushed to the mult_exprs. -// If the element it found to match Mod, it will be pused to the mod_exprs. -// Otherwise, the elements will be added to the no_opt_sum variable -inline void MergeMulModInsertElements(const std::vector& eles, - std::list* mult_exprs, - std::list >* mod_exprs, - PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { - using namespace tir; - *has_mult = false; - *has_mod = false; - for (const PrimExpr* ele : eles) { - auto mod_ptr = ele->as(); - auto mult_ptr = ele->as(); - if (mod_ptr) { - *has_mod = true; - mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); - } else if (mult_ptr) { - *has_mult = true; - mult_exprs->emplace_back(*ele); - } else { - *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; - } - } -} - -// Searches for this types of expr: -// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// + c % (k1 * k2 * ... * ki) -// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c -// The search will be performed repeatively until no pattern is found. 
-// Return: a pair with (false, Expr()) if cannot be optimized. -// a pair with (true, optimized_expr) if can be optimized -inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { - using namespace tir; - // 1. Prepare the lists. - // We store two lists, a list that contain all the elements that match Mul and - // a list that contain all the elements that match Mod. - // The elements in the Mod will be used to match against the elements in Mul. - // The result will then be split and pushed back to these two lists. - PrimExpr simplified_base = base; - arith::PVar x, y; - if ((floordiv(x, y) * y + floormod(x, y)).Match(simplified_base)) { - simplified_base = x.Eval(); - } - simplified_base = analyzer->Simplify(simplified_base); - std::vector eles = ExprSplitAddition(simplified_base); - std::list mult_exprs; - std::list > mod_exprs; - PrimExpr no_opt_sum; - bool has_mult; - bool has_mod; - MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); - bool find_opt = false; - std::list >::iterator search_mod_it = mod_exprs.begin(); - // 2. Exhaustive Search - while (search_mod_it != mod_exprs.end()) { - std::list::iterator mult_it = mult_exprs.begin(); - bool inner_find_opt = false; - while (mult_it != mult_exprs.end()) { - std::pair ret = - MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); - if (ret.first) { - inner_find_opt = true; - auto temp_mod_it = search_mod_it; - ++search_mod_it; - mod_exprs.erase(temp_mod_it); - mult_exprs.erase(mult_it); - std::vector ret_eles = ExprSplitAddition(ret.second); - MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, - &has_mod); - if (has_mult) { - search_mod_it = mod_exprs.begin(); - } else if (has_mod && search_mod_it == mod_exprs.end()) { - search_mod_it--; - } - break; - } else { - ++mult_it; - } - } - find_opt = find_opt || inner_find_opt; - if (!inner_find_opt) { - ++search_mod_it; - } - } - if (!find_opt) { - return simplified_base; - } - for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; - } - for (std::list >::iterator it = mod_exprs.begin(); - it != mod_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) - : indexmod(it->first, it->second); - } - return no_opt_sum; -} - -// The buffer offset in convention of number of elements of -// original data ignoring number of lanes. -// We also perform optimization to simplify the indexing expression. 
-inline PrimExpr ElemOffset(const BufferNode* n, Array index) { - PrimExpr base = n->elem_offset; - arith::Analyzer ana; - if (n->strides.size() == 0) { - // Scalar case - if (n->shape.size() == 0 && index.size() == 1) { - auto is_int = index[0].as(); - ICHECK(is_int && is_int->value == 0); - base = base + index[0]; - } else { - ICHECK_EQ(n->shape.size(), index.size()); - if (index.size() > 0) { - PrimExpr offset = index[0]; - for (size_t i = 1; i < index.size(); ++i) { - offset = MergeMulMod(&ana, offset * n->shape[i] + index[i]); - } - base = base + offset; - } - } - } else { - ICHECK_EQ(n->strides.size(), index.size()); - if (is_zero(base)) { - base = MergeMulMod(&ana, index[0] * n->strides[0]); - } else { - base = MergeMulMod(&ana, base + index[0] * n->strides[0]); - } - for (size_t i = 1; i < index.size(); ++i) { - base = MergeMulMod(&ana, base + index[i] * n->strides[i]); - } - } - return base; -} - -inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataType dtype) { - PrimExpr offset = ElemOffset(n, index); - if (n->dtype.lanes() != 1) { - offset = offset * make_const(offset.dtype(), dtype.lanes()); - } - if (dtype.lanes() != 1) { - return tir::Ramp(offset, make_const(offset.dtype(), 1), dtype.lanes()); - } else { - return offset; - } -} -} - PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index f73208e5759e..d3544b5c02cf 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -41,201 +41,16 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" +#include "../ir/buffer_common.h" #include "arg_binder.h" #include "ir_utils.h" namespace tvm { namespace tir { namespace { - using IndexMod = tir::FloorModNode; - using IndexDiv = tir::FloorDivNode; - -// Split the given expression w.r.t the add operator -inline std::vector ExprSplitAddition(const PrimExpr& expr) { - using namespace tir; - std::vector ret; - std::stack split_buffer; - split_buffer.push(&expr); - while (!split_buffer.empty()) { - const PrimExpr* top_ele = split_buffer.top(); - split_buffer.pop(); - auto expr_add_match = top_ele->as(); - if (expr_add_match) { - split_buffer.push(&expr_add_match->b); - split_buffer.push(&expr_add_match->a); - } else { - ret.emplace_back(top_ele); - } - } - return ret; -} - -// Searches for the following types of expr: -// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// mod_l_expr = c -// mod_r_expr = k1 * k2 * ... * ki -// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) -// Currently the we will not search the add/mult combinations exhaustively -// as it will take too much computation. -inline std::pair MergeMulModInner(const PrimExpr& mult_expr, - const PrimExpr& mod_l_expr, - const PrimExpr& mod_r_expr) { - using namespace tir; - const MulNode* mult_ptr = mult_expr.as(); - if (!mult_ptr) return std::make_pair(false, PrimExpr()); - PrimExpr mult_outer = mult_ptr->b; - const PrimExpr* inner = &(mult_ptr->a); - // 1. Calculate the outer multiplier - while (true) { - mult_ptr = inner->as(); - if (mult_ptr) { - inner = &(mult_ptr->a); - mult_outer = mult_ptr->b * mult_outer; - } else { - break; - } - } - // 2. Search for the pattern c / (...) * (...) + c % (...) - // We match the search element with Add, Mul and Div. 
- // If Add is found, we need to continue our search for the rhs - // If Mult is found, we will expand the inner multiplication factor - // If Div is found, we will go on testing whether lhs matches the lhs of mod expr - // and returns the optimization result. - const PrimExpr* search_ptr = inner; - PrimExpr mult_inner; // The inner multiplication factor - PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized - tir::ExprDeepEqual expr_equal; - - while (true) { - auto inner_div_ptr = search_ptr->as(); - auto inner_mult_ptr = search_ptr->as(); - auto inner_add_ptr = search_ptr->as(); - if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { - return std::make_pair(false, PrimExpr()); - } else if (inner_div_ptr) { - PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; - if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && - expr_equal(inner_div_ptr->a, mod_l_expr)) { - // Found! - PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; - return std::make_pair(true, ret); - } else { - return std::make_pair(false, PrimExpr()); - } - } else if (inner_mult_ptr) { - mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; - search_ptr = &(inner_mult_ptr->a); - } else if (inner_add_ptr) { - if (mult_inner.get()) { - return std::make_pair(false, PrimExpr()); - } - no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; - search_ptr = &(inner_add_ptr->b); - } else { - LOG(FATAL) << "Unexpected search result!"; - break; - } - } - return std::make_pair(false, PrimExpr()); -} - -// Insert the elements into the corresponding mult_exprs and mod_exprs. -// If the element is found to match Mul, it will be pushed to the mult_exprs. -// If the element it found to match Mod, it will be pused to the mod_exprs. -// Otherwise, the elements will be added to the no_opt_sum variable -inline void MergeMulModInsertElements(const std::vector& eles, - std::list* mult_exprs, - std::list >* mod_exprs, - PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { - using namespace tir; - *has_mult = false; - *has_mod = false; - for (const PrimExpr* ele : eles) { - auto mod_ptr = ele->as(); - auto mult_ptr = ele->as(); - if (mod_ptr) { - *has_mod = true; - mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); - } else if (mult_ptr) { - *has_mult = true; - mult_exprs->emplace_back(*ele); - } else { - *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; - } - } -} - -// Searches for this types of expr: -// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki -// + c % (k1 * k2 * ... * ki) -// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c -// The search will be performed repeatively until no pattern is found. -// Return: a pair with (false, Expr()) if cannot be optimized. -// a pair with (true, optimized_expr) if can be optimized -inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { - using namespace tir; - // 1. Prepare the lists. - // We store two lists, a list that contain all the elements that match Mul and - // a list that contain all the elements that match Mod. - // The elements in the Mod will be used to match against the elements in Mul. - // The result will then be split and pushed back to these two lists. 
- PrimExpr simplified_base = analyzer->Simplify(base); - std::vector eles = ExprSplitAddition(simplified_base); - std::list mult_exprs; - std::list > mod_exprs; - PrimExpr no_opt_sum; - bool has_mult; - bool has_mod; - MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); - bool find_opt = false; - std::list >::iterator search_mod_it = mod_exprs.begin(); - // 2. Exhaustive Search - while (search_mod_it != mod_exprs.end()) { - std::list::iterator mult_it = mult_exprs.begin(); - bool inner_find_opt = false; - while (mult_it != mult_exprs.end()) { - std::pair ret = - MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); - if (ret.first) { - inner_find_opt = true; - auto temp_mod_it = search_mod_it; - ++search_mod_it; - mod_exprs.erase(temp_mod_it); - mult_exprs.erase(mult_it); - std::vector ret_eles = ExprSplitAddition(ret.second); - MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, - &has_mod); - if (has_mult) { - search_mod_it = mod_exprs.begin(); - } else if (has_mod && search_mod_it == mod_exprs.end()) { - search_mod_it--; - } - break; - } else { - ++mult_it; - } - } - find_opt = find_opt || inner_find_opt; - if (!inner_find_opt) { - ++search_mod_it; - } - } - if (!find_opt) { - return simplified_base; - } - for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; - } - for (std::list >::iterator it = mod_exprs.begin(); - it != mod_exprs.end(); ++it) { - no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) - : indexmod(it->first, it->second); - } - return no_opt_sum; -} inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { - PrimExpr base = make_const(DataType::Int(32), 0); //IntImm(DataType::Int(32), 0); + PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); arith::Analyzer ana; if (index.size() > 0) { @@ -256,7 +71,6 @@ size_t GetAxisSeparator(size_t shape_rank) { // // e.g. [N,C,H,W,c] -> TextureFlattening -> [N*C*H, W, c] // - return shape_rank - 2; } } From 6be6e6024e97d2b95c1bf5314b901cf0dd4231e7 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 5 Mar 2021 21:28:30 -0800 Subject: [PATCH 24/59] More refactor and clean up in texture lowering. --- src/tir/transforms/texture_flatten.cc | 89 ++++++++++----------------- 1 file changed, 33 insertions(+), 56 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d3544b5c02cf..cba4751bc668 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -70,7 +70,6 @@ size_t GetAxisSeparator(size_t shape_rank) { // axes are packed into rows. // // e.g. 
[N,C,H,W,c] -> TextureFlattening -> [N*C*H, W, c] - // return shape_rank - 2; } } @@ -112,7 +111,7 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } - // Buffer set + // External buffer std::unordered_set extern_buf_; // Storage scope std::unordered_map storage_scope_; @@ -146,8 +145,6 @@ class TextureFlattener : public TextureLoweringBase { Array shape; auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); - // TODO(csulivan): We do not currently handle the case where - // the last dimension isn't previously set to a vector(4) for (size_t i = 0; i < op->bounds.size()-1; i++) { if (i < GetAxisSeparator(op->bounds.size())) { height *= op->bounds[i]->extent; @@ -155,7 +152,6 @@ class TextureFlattener : public TextureLoweringBase { width *= op->bounds[i]->extent; } } - Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -166,34 +162,11 @@ class TextureFlattener : public TextureLoweringBase { Stmt VisitStmt_(const BufferStoreNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - std::string storage_scope = GetStorageScope(op->buffer); + // Lower to two dimensional access if (storage_scope == "texture") { - Array args; - if (let_binding_.count(op->buffer->data)) { - args.push_back(let_binding_[op->buffer->data]); - } else { - args.push_back(op->buffer->data); - } - - Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size())) { - col_dims.push_back(op->buffer->shape[i]); - col_indices.push_back(op->indices[i]); - } else { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } - } - - PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); - PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); - - args.push_back(row_offset); - args.push_back(col_offset); + Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); - stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); } @@ -203,37 +176,15 @@ class TextureFlattener : public TextureLoweringBase { PrimExpr VisitExpr_(const BufferLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - + // Replace with identitcal external buffer if one exists auto buffer = op->buffer; if (buffer_binds_.count(op->buffer)) { buffer = buffer_binds_[op->buffer]; } - + // Lower to two dimensional access std::string storage_scope = GetStorageScope(buffer); if (storage_scope == "texture") { - Array args; - if (let_binding_.count(op->buffer->data)) { - args.push_back(let_binding_[op->buffer->data]); - } else { - args.push_back(buffer->data); - } - - - Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size())) { - col_dims.push_back(op->buffer->shape[i]); - col_indices.push_back(op->indices[i]); - } else { - row_dims.push_back(op->buffer->shape[i]); - row_indices.push_back(op->indices[i]); - } - } - - PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); - PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); - args.push_back(row_offset); - args.push_back(col_offset); + Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); } @@ -243,6 +194,31 @@ class TextureFlattener : public 
TextureLoweringBase { protected: + template + Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { + Array args; + if (let_binding_.count(op->buffer->data)) { + args.push_back(let_binding_[op->buffer->data]); + } else { + args.push_back(buffer->data); + } + Array row_dims, row_indices, col_dims, col_indices; + for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { + if (i < GetAxisSeparator(op->buffer->shape.size())) { + col_dims.push_back(op->buffer->shape[i]); + col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } + } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + args.push_back(row_offset); + args.push_back(col_offset); + return args; + } + // Let binding std::unordered_map let_binding_; std::unordered_map buffer_binds_; @@ -283,7 +259,8 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access - if (GetStorageScope(load->buffer) == "texture" && GetStorageScope(op->buffer) == "texture") { + if (GetStorageScope(load->buffer) == "texture" && + GetStorageScope(op->buffer) == "texture") { auto store_index = SimplifyOffset(op->buffer->shape, op->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { From 48cf179dfe4bdd1bd959fcb46a55f821d71ee061 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sun, 7 Mar 2021 15:17:43 -0800 Subject: [PATCH 25/59] Add IsTextureType to tir and allow buffer var type annotation to be TextureType in addition to PointerType. --- include/tvm/tir/op.h | 15 +++++++++++++++ src/tir/ir/buffer.cc | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index 9cf7d0a3cd1f..c45a9d77a446 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -895,6 +895,21 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) { } return false; } +/*! + * \brief Check if type is a texture handle of a runtime element type. + * \param type The type to be checked. + * \param element_type The corresponding element type. + * \return The check results + */ +inline bool IsTextureType(const Type& type, const DataType& element_type) { + if (!type.defined()) return false; + if (const auto* ptr_type = type.as()) { + if (const auto* prim_type = ptr_type->element_type.as()) { + return prim_type->dtype == element_type; + } + } + return false; +} /*! * \brief Make a const value with certain data type. 
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index beee377d8401..cd6cffabcda4 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -166,7 +166,8 @@ Buffer::Buffer(Var data, DataType dtype, Array shape, Array if (storage_dtype == DataType::Bool()) { storage_dtype = DataType::Int(8); } - ICHECK(IsPointerType(data->type_annotation, storage_dtype)) + ICHECK(IsPointerType(data->type_annotation, storage_dtype) || + IsTextureType(data->type_annotation, storage_dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype; From 91f00ee89ebeeeb31d536ace1b66e68bdbaf83cc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 9 Mar 2021 14:27:00 -0800 Subject: [PATCH 26/59] Bug fix in texture access qualifier inference pass --- src/target/source/codegen_opencl.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index b8ff1d451445..05136045fc80 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -69,6 +69,7 @@ class InferTextureAccess : public StmtExprVisitor { var_access_map_[buffer] |= write_access; } } + StmtExprVisitor::VisitExpr_(op); } private: std::unordered_map var_access_map_; From 0ac0875c2e03cf2125346cd1324e69c639b66671 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 9 Mar 2021 11:29:04 -0800 Subject: [PATCH 27/59] Step toward handling external texture buffer forwarding when external buffer is not stored directly to cache_read realized buffer. For example when it is conditionally stored via an IfThenElse node when padding is used. --- src/tir/transforms/texture_flatten.cc | 33 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index cba4751bc668..1639f1cc2416 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -252,28 +252,48 @@ class ExternalBufferForwarding : public TextureLoweringBase { } Stmt VisitStmt_(const BufferStoreNode* op) final { + ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; + external_loads_.emplace_back(); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - if (auto load = op->value.as()) { + auto check_identity = [this](const BufferStoreNode* store, const BufferLoad& load) { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access if (GetStorageScope(load->buffer) == "texture" && - GetStorageScope(op->buffer) == "texture") { - auto store_index = SimplifyOffset(op->buffer->shape, op->indices); + GetStorageScope(store->buffer) == "texture") { + auto store_index = SimplifyOffset(store->buffer->shape, store->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { - extern_buffer_copy_.insert(op->buffer); - buffer_map_.insert({op->buffer, load->buffer}); + extern_buffer_copy_.insert(store->buffer); + buffer_map_.insert({store->buffer, load->buffer}); } } } - } + }; + if (auto load_node = op->value.as()) { + check_identity(op, GetRef(load_node)); + } else { + // Stored value is not a load, check for external loads collected + // when visiting the store node's value + for (auto& expr : external_loads_.back()) { + check_identity(op, Downcast(expr)); + } + } + 
external_loads_.pop_back(); return stmt; } + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + if (external_loads_.size() && extern_buf_.count(op->buffer)) { + external_loads_.back().push_back(expr); + } + return expr; + } + const std::unordered_map& GetForwardedBuffers() { return buffer_map_; } @@ -282,6 +302,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { std::deque realize_attrs_; std::unordered_set extern_buffer_copy_; std::unordered_map buffer_map_; + std::vector> external_loads_; }; From 50d4ee523bd38423aa43455f7866c1595a57db2a Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 17 Mar 2021 10:54:27 -0700 Subject: [PATCH 28/59] [Part 2/3] Support texture:weight lowering convention for externally provided texture buffers. Need to propagate this to allocated textures when cache_read(texture) is used for weights. --- src/tir/transforms/texture_flatten.cc | 34 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 1639f1cc2416..80c8accda588 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -63,14 +63,26 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] - return shape_rank - 2; + // Texture weight: + // e.g. [O,I,H,W,c] -> TextureFlattening -> [O, I*H*W, c] + size_t separator; + if (scope == "texture"){ + separator = shape_rank - 2; + } else if (scope == "texture:weight") { + separator = 1; + } + return separator; +} + +bool IsTextureStorage(std::string scope) { + return scope.find("texture") != std::string::npos; } } @@ -136,7 +148,7 @@ class TextureFlattener : public TextureLoweringBase { Stmt body = this->VisitStmt(op->body); std::string storage_scope = GetStorageScope(op->buffer); - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); @@ -146,7 +158,7 @@ class TextureFlattener : public TextureLoweringBase { auto width = IntImm(DataType::Int(32), 1); auto height = IntImm(DataType::Int(32), 1); for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator(op->bounds.size())) { + if (i < GetAxisSeparator(op->bounds.size(), storage_scope)) { height *= op->bounds[i]->extent; } else { width *= op->bounds[i]->extent; @@ -164,7 +176,7 @@ class TextureFlattener : public TextureLoweringBase { op = stmt.as(); std::string storage_scope = GetStorageScope(op->buffer); // Lower to two dimensional access - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); @@ -183,7 +195,7 @@ class TextureFlattener : public TextureLoweringBase { } // Lower to two dimensional access std::string storage_scope = GetStorageScope(buffer); - if (storage_scope == "texture") { + if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::text2d_load(), args); @@ -204,7 +216,7 @@ class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - 
if (i < GetAxisSeparator(op->buffer->shape.size())) { + if (i < GetAxisSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { @@ -236,7 +248,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (op->body->IsInstance()) { const auto* realize = Downcast(op->body).get(); std::string realize_scope = GetStorageScope(realize->buffer); - if (realize_scope == "texture" && extern_buffer_copy_.count(realize->buffer)) { + if (IsTextureStorage(realize_scope) && extern_buffer_copy_.count(realize->buffer)) { return realize_attrs_.back(); } else { if (realize_attrs_.size()) { @@ -261,8 +273,8 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (extern_buf_.count(load->buffer)) { // If the buffer to load and the buffer to store to are both texture // check for identical access - if (GetStorageScope(load->buffer) == "texture" && - GetStorageScope(store->buffer) == "texture") { + if (IsTextureStorage(GetStorageScope(load->buffer)) && + IsTextureStorage(GetStorageScope(store->buffer))) { auto store_index = SimplifyOffset(store->buffer->shape, store->indices); auto load_index = SimplifyOffset(load->buffer->shape, load->indices); if (arith::Analyzer().CanProve(store_index == load_index)) { From fba2d3ff0d30e4bf8ad7b35132f578ffdef03129 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 8 Mar 2021 22:46:51 -0800 Subject: [PATCH 29/59] Bug fix in texture access qualifier inference pass --- src/target/source/codegen_opencl.cc | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 05136045fc80..c8a549a54491 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -40,7 +40,7 @@ class InferTextureAccess : public StmtExprVisitor { explicit InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { - this->operator()(n); + StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { if (texture.second == read_access) { @@ -56,21 +56,17 @@ class InferTextureAccess : public StmtExprVisitor { return storage_scope_qualifiers; } void VisitExpr_(const CallNode* op) { - if (!op->args.size()) - { - return; + if (op->op.same_as(builtin::text2d_load())) { + var_access_map_[op->args[0].as()] |= read_access; } - if (const VarNode* buffer = op->args[0].as()) - { - if (op->op.same_as(builtin::text2d_load())) { - var_access_map_[buffer] |= read_access; - } - else if (op->op.same_as(builtin::text2d_store())) { - var_access_map_[buffer] |= write_access; - } + else if (op->op.same_as(builtin::text2d_store())) { + var_access_map_[op->args[0].as()] |= write_access; + } else { + StmtExprVisitor::VisitExpr_(op); } StmtExprVisitor::VisitExpr_(op); } + private: std::unordered_map var_access_map_; }; From 2fc4238d97e60d6275d68721d00f3f346dfdc1e2 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 12 Mar 2021 14:03:11 -0800 Subject: [PATCH 30/59] Tighten constraint on external buffer forwarding -- cache_read(texture) cancellation -- to avoid incorrect programs. Currently only forward through if_then_else node and direct external loads. For if_then_else, still need proper analysis of structural equality between buffers and access patterns to determine if an external buffer can replace the texture buffer realized via cache_read. 
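
The soundness condition in the if_then_else case reduces to proving that the guarded external load addresses exactly the element being stored into the cache_read buffer. A minimal sketch of that style of access-equality check, using only the arithmetic analyzer; the shape extents, index expressions, and function name below are illustrative, not taken from this series:

    // Sketch: prove two flattened offsets denote the same element, in the same
    // spirit as comparing SimplifyOffset results for a store/load pair.
    #include <tvm/arith/analyzer.h>
    #include <tvm/tir/expr.h>
    #include <tvm/tir/op.h>
    #include <tvm/tir/var.h>

    bool SameAccess() {
      using namespace tvm;
      tir::Var i("i", DataType::Int(32)), j("j", DataType::Int(32));
      // Offsets into an illustrative [16, 4] buffer, written in two different forms.
      PrimExpr store_index = i * 4 + j;
      PrimExpr load_index = i * 2 * 2 + j;
      arith::Analyzer analyzer;
      return analyzer.CanProve(tir::EQ(store_index, load_index));
    }

The analyzer-based index comparison is the piece already in place; the structural comparison of the buffers themselves is the part called out above as still missing.
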
--- src/tir/transforms/texture_flatten.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 80c8accda588..c2398ed9ee39 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -265,7 +265,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { Stmt VisitStmt_(const BufferStoreNode* op) final { ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; - external_loads_.emplace_back(); + if (auto* call_node = op->value.as()) { + // Path to supporting external cache_read canceling when padding has induced + // a conditional load into the cache_read buffer. We may be able to elide the + // conditional completely due to hardware support for returning 0 when OOB + if (call_node->op.same_as(builtin::if_then_else())) { + external_loads_.emplace_back(); + } + } Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); @@ -287,14 +294,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { if (auto load_node = op->value.as()) { check_identity(op, GetRef(load_node)); - } else { + } else if (external_loads_.size()) { // Stored value is not a load, check for external loads collected - // when visiting the store node's value + // when visiting the store node's value, e.g. from if_then_else for (auto& expr : external_loads_.back()) { check_identity(op, Downcast(expr)); } + external_loads_.pop_back(); } - external_loads_.pop_back(); return stmt; } From 384dad02637e0b1f08bd71725b8f8aa264273426 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:56:31 -0700 Subject: [PATCH 31/59] Use texture lowering convention from texture runtime util. --- src/tir/transforms/texture_flatten.cc | 37 +++++++-------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index c2398ed9ee39..8b0989d9e40c 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -41,6 +41,7 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" +#include "../../runtime/texture.h" #include "../ir/buffer_common.h" #include "arg_binder.h" #include "ir_utils.h" @@ -63,24 +64,6 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array TextureFlattening -> [N*C*H, W, c] - // Texture weight: - // e.g. 
[O,I,H,W,c] -> TextureFlattening -> [O, I*H*W, c] - size_t separator; - if (scope == "texture"){ - separator = shape_rank - 2; - } else if (scope == "texture:weight") { - separator = 1; - } - return separator; -} - bool IsTextureStorage(std::string scope) { return scope.find("texture") != std::string::npos; } @@ -155,15 +138,13 @@ class TextureFlattener : public TextureLoweringBase { ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; Array shape; - auto width = IntImm(DataType::Int(32), 1); - auto height = IntImm(DataType::Int(32), 1); - for (size_t i = 0; i < op->bounds.size()-1; i++) { - if (i < GetAxisSeparator(op->bounds.size(), storage_scope)) { - height *= op->bounds[i]->extent; - } else { - width *= op->bounds[i]->extent; - } - } + Integer width = 1, height = 1; + size_t axis = runtime::DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + struct Shape { + Array bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + std::tie(width, height) = runtime::ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); Array args = {width, height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -216,7 +197,7 @@ class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < GetAxisSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { + if (i < runtime::DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { From 3c1b1220641f6e34a48201ed97d60df6ba75b839 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 19 Mar 2021 15:58:25 -0700 Subject: [PATCH 32/59] Use updated texture lowering utilities --- src/tir/transforms/texture_flatten.cc | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 8b0989d9e40c..e4e8861b492e 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -50,6 +50,10 @@ namespace tvm { namespace tir { namespace { +using runtime::IsTextureStorage; +using runtime::DefaultTextureLayoutSeparator; +using runtime::ApplyTexture2DFlattening; + inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); @@ -63,10 +67,6 @@ inline PrimExpr SimplifyOffset(const Array& shape, const Array(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; - Array shape; - Integer width = 1, height = 1; - size_t axis = runtime::DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); struct Shape { - Array bounds; + const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; - std::tie(width, height) = runtime::ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); - Array args = {width, height}; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); + Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); } @@ -197,7 +195,7 @@ 
class TextureFlattener : public TextureLoweringBase { } Array row_dims, row_indices, col_dims, col_indices; for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { - if (i < runtime::DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { + if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); } else { From 137da362ae50180e8f5e03b3368fa309329161cc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 15 Mar 2021 16:29:18 -0700 Subject: [PATCH 33/59] Use inherited visitor overloads in texture flattener. --- src/tir/transforms/texture_flatten.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index e4e8861b492e..cbc7a3ced362 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -114,6 +114,7 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: + using StmtExprMutator::VisitStmt_; explicit TextureFlattener(const Map& extern_buffer_map, const std::unordered_map& extern_buffer_binds_) : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} From 5b6787e27e74773a48b577219b8214e68ce11628 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 16 Mar 2021 23:08:03 -0700 Subject: [PATCH 34/59] Add check in codegen for float/half until read/write_image codegen supports other types. --- src/target/source/codegen_opencl.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index c8a549a54491..a17897d2d10a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -358,6 +358,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { } else if (buffer_type.is_float()) { os << "write_imagef("; + } else { + LOG(FATAL) << "Unsupported type: " << buffer_type + << ", currently only float and half are supported for image2d OpenCL codegen."; } this->PrintExpr(op->args[0], os); os << ", "; @@ -375,6 +378,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { } else if (op->dtype.is_float()) { ss << "read_imagef("; + } else { + LOG(FATAL) << "Unsupported type: " << op->dtype + << ", currently only float and half are supported for image2d OpenCL codegen."; } this->PrintExpr(op->args[0], ss); ss << ", "; From c839505e689712ea428898dfa0f1d4bb363b0bc4 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 17 Mar 2021 09:57:59 -0700 Subject: [PATCH 35/59] Rename tir texture builtins --- include/tvm/tir/builtin.h | 10 +++++----- src/target/source/codegen_opencl.cc | 18 +++++++++--------- src/tir/op/builtin.cc | 6 +++--- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/texture_flatten.cc | 6 +++--- src/tir/transforms/vectorize_loop.cc | 8 ++------ 6 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 66fa069d62fa..86857a33cdf4 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -603,17 +603,17 @@ TVM_DLL const Op& atomic_add(); /*! * \brief Create a texture 2d memory allocation */ -TVM_DLL const Op& text2d_alloca(); +TVM_DLL const Op& texture2d_alloca(); /*! 
- * \brief Store to a texture 2d memory + * \brief Store to texture 2d memory */ -TVM_DLL const Op& text2d_store(); +TVM_DLL const Op& texture2d_store(); /*! - * \brief Load from a texture 2d memory + * \brief Load from texture 2d memory */ -TVM_DLL const Op& text2d_load(); +TVM_DLL const Op& texture2d_load(); /*! \brief The kind of structure field info used in intrinsic */ enum TVMStructFieldKind : int { diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index a17897d2d10a..87cbe8dd4d5a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -56,10 +56,9 @@ class InferTextureAccess : public StmtExprVisitor { return storage_scope_qualifiers; } void VisitExpr_(const CallNode* op) { - if (op->op.same_as(builtin::text2d_load())) { + if (op->op.same_as(builtin::texture2d_load())) { var_access_map_[op->args[0].as()] |= read_access; - } - else if (op->op.same_as(builtin::text2d_store())) { + } else if (op->op.same_as(builtin::texture2d_store())) { var_access_map_[op->args[0].as()] |= write_access; } else { StmtExprVisitor::VisitExpr_(op); @@ -304,7 +303,7 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { if (auto call = op->value.as()) { - if (call->op.same_as(builtin::text2d_load())) { + if (call->op.same_as(builtin::texture2d_load())) { need_texture_ssa_ = false; // If storing a texture load into a buffer, don't use an // intermediate local unless the buffer allocation is a @@ -322,7 +321,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { if (auto call = op->value.as()) { - if (call->op.same_as(builtin::text2d_load())) { + if (call->op.same_as(builtin::texture2d_load())) { need_texture_ssa_ = false; } } @@ -349,9 +348,10 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << " *)" << this->GetVarID(load->buffer_var.get()) << " + "; this->PrintExpr(load->index, os); os << ')'; - } else if (op->op.same_as(builtin::text2d_store())) { - auto* texture_type = op->args[0].as()->type_annotation.as(); - ICHECK(texture_type != nullptr) << "builtin::text2d_store() only supports storing to texture buffers"; + } else if (op->op.same_as(builtin::texture2d_store())) { + auto* texture_type = op->args[0].as()->type_annotation.as(); + ICHECK(texture_type != nullptr) + << "builtin::texture2d_store() only supports storing to texture buffers"; DataType buffer_type = texture_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; @@ -371,7 +371,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { os << "), "; this->PrintExpr(op->args[3], os); os << ")"; - } else if (op->op.same_as(builtin::text2d_load())) { + } else if (op->op.same_as(builtin::texture2d_load())) { std::stringstream ss; if (op->dtype.is_float16()) { ss << "read_imageh("; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 7705369eb5c8..c593cbf7290c 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -246,14 +246,14 @@ TIR_DEFINE_BUILTIN_FUNC(vectorcombine) TIR_DEFINE_BUILTIN_FUNC(atomic_add) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_alloca) +TIR_DEFINE_BUILTIN_FUNC(texture2d_alloca) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_store) +TIR_DEFINE_BUILTIN_FUNC(texture2d_store) .set_attr("TVectorizable", 
true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TIR_DEFINE_BUILTIN_FUNC(text2d_load) +TIR_DEFINE_BUILTIN_FUNC(texture2d_load) .set_attr("TVectorizable", true) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 19d434006b83..9c28c6f55926 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -100,7 +100,7 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt_(const LetStmtNode* op) final { if (const CallNode* call = op->value.as()) { - if (call->op.same_as(builtin::text2d_alloca())) { + if (call->op.same_as(builtin::texture2d_alloca())) { return StmtExprMutator::VisitStmt(MakeTextureAlloc(op, call)); } } diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index cbc7a3ced362..740742b1a0ff 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -145,7 +145,7 @@ class TextureFlattener : public TextureLoweringBase { size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; - stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::text2d_alloca(), args), body); + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } return stmt; @@ -159,7 +159,7 @@ class TextureFlattener : public TextureLoweringBase { if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->value); - stmt = Evaluate(Call(args[0]->dtype, builtin::text2d_store(), args)); + stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); } return stmt; @@ -178,7 +178,7 @@ class TextureFlattener : public TextureLoweringBase { if (IsTextureStorage(storage_scope)) { Array args = GetTextureAccessArgs(op, buffer); args.push_back(op->indices.back()); - expr = Call(op->buffer->dtype, builtin::text2d_load(), args); + expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); } return expr; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 9943d1e37938..cd2d230f5775 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -265,18 +265,14 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::if_then_else())) { return MutateIfThenElseExpr_(op); - } - else if (op->op.same_as(builtin::text2d_load())) - { + } else if (op->op.same_as(builtin::texture2d_load())) { int lane = 0; Array fcd = MutateArray({op->args.back()}, &lane); auto new_args = op->args; new_args.pop_back(); new_args.push_back(fcd[0]); return Call(op->dtype.with_lanes(4), op->op, new_args); - } - else if (op->op.same_as(builtin::text2d_store())) - { + } else if (op->op.same_as(builtin::texture2d_store())) { int lane = 0; // Vectorize the value to store Array value{op->args.back()}; From 8041cc900089977e6915f641fd5b5c82065648d8 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 23 Mar 2021 11:34:54 -0700 Subject: [PATCH 36/59] Remove codegen and tir runtime dependence on for TVMBackendAlloc/FreeTexture. 
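
The replacement path, introduced in the following patch, lowers texture allocations to target-specialized tir.tvm_call_packed calls named "device_api.<device>.AllocTexture" / ".FreeTexture". A rough runtime-side sketch of how such a call resolves; the "device_api.opencl.AllocTexture" name mirrors the prefix composed during lowering, and whether a given runtime actually registers that global is an assumption here:

    // Sketch: resolve and invoke a device-API packed function by name.
    #include <cstdint>
    #include <tvm/runtime/registry.h>

    void* AllocTextureViaPackedCall(int device_type, int device_id, uint64_t width,
                                    uint64_t height, int dtype_code, int dtype_bits) {
      const tvm::runtime::PackedFunc* f =
          tvm::runtime::Registry::Get("device_api.opencl.AllocTexture");
      if (f == nullptr) return nullptr;  // runtime does not provide this device API hook
      // Argument order mirrors the packed call emitted by the lowering pass.
      return (*f)(device_type, device_id, width, height, dtype_code, dtype_bits);
    }

Routing through the packed-function registry keeps the generated host code free of texture-specific C symbols, which is what allows the hard-coded TVMBackendAllocTexture/FreeTexture externs to be dropped here.
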
--- src/target/llvm/codegen_cpu.cc | 2 -- src/tir/op/runtime.cc | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 8b01f9d9186e..ab96d6e69d14 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -403,8 +403,6 @@ void CodeGenCPU::InitGlobalContext(bool dynamic_lookup) { // Mark as context functions gv_func_map_["TVMBackendAllocWorkspace"] = nullptr; gv_func_map_["TVMBackendFreeWorkspace"] = nullptr; - gv_func_map_["TVMBackendAllocTexture"] = nullptr; - gv_func_map_["TVMBackendFreeTexture"] = nullptr; } } } diff --git a/src/tir/op/runtime.cc b/src/tir/op/runtime.cc index 2a894d00ec0c..adabae9e75f7 100644 --- a/src/tir/op/runtime.cc +++ b/src/tir/op/runtime.cc @@ -37,15 +37,5 @@ TVM_REGISTER_OP("tir.TVMBackendFreeWorkspace") .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); -TVM_REGISTER_OP("tir.TVMBackendAllocTexture") - .set_num_inputs(6) - .set_attr("TGlobalSymbol", "TVMBackendAllocTexture") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - -TVM_REGISTER_OP("tir.TVMBackendFreeTexture") - .set_num_inputs(3) - .set_attr("TGlobalSymbol", "TVMBackendFreeTexture") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - } // namespace tir } // namespace tvm From 684e513042f32de77aad5a190683cfe92962c034 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 29 Apr 2021 15:32:57 -0700 Subject: [PATCH 37/59] Dispatch texture allocas via target specialized tir.tvm_call_packed --- src/tir/transforms/lower_tvm_builtin.cc | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 9c28c6f55926..83e6e97a428f 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -360,20 +360,25 @@ class BuiltinLower : public StmtExprMutator { throw_last_error), let->body}); DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; - Stmt alloca = LetStmt( - let->var, - Call(let->var.dtype(), Op::Get("tir.TVMBackendAllocTexture"), - {cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), - cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), - IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}), - body); - PrimExpr free_op = Call(DataType::Int(32), Op::Get("tir.TVMBackendFreeTexture"), - {cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + std::string fdevapi_prefix = "device_api."; + fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); + Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), + cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), + cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), + IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}); + + Stmt alloca = LetStmt(let->var, call_packed, body); + + Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), + cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); + Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); 
return body; From 069ec771cf2d0d7a07047a3191d7294dd2f94517 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 4 May 2021 15:23:36 -0700 Subject: [PATCH 38/59] Remove kTexture scope and use kGlobal with texture tag. --- src/runtime/thread_storage_scope.h | 7 ------- src/te/operation/op_utils.cc | 2 +- src/te/schedule/bound.cc | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index 611a40d996ea..ac8260ffbe39 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -59,8 +59,6 @@ enum class StorageRank { kWMMAMatrixB = 5, /*! \brief wmma scope memory of accumulator */ kWMMAAccumulator = 6, - /*! \brief global scope texture memory */ - kTexture = 7, }; /*! @@ -110,8 +108,6 @@ struct StorageScope { return "wmma.matrix_b" + tag; case StorageRank::kWMMAAccumulator: return "wmma.accumulator" + tag; - case StorageRank::kTexture: - return "texture" + tag; default: LOG(FATAL) << "unknown storage scope"; return ""; @@ -147,9 +143,6 @@ struct StorageScope { } else if (s.compare(0, 16, "wmma.accumulator") == 0) { r.rank = StorageRank::kWMMAAccumulator; r.tag = s.substr(16, std::string::npos); - } else if (s.compare(0, 7, "texture") == 0) { - r.rank = StorageRank::kTexture; - r.tag = s.substr(7, std::string::npos); } else { LOG(FATAL) << "unknown storage scope " << s; } diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index de0d6b5be848..ddc78866ae02 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -161,7 +161,7 @@ std::vector > MakeLoopNest(const Stage& stage, } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); - if (static_cast(ss.rank) <= ts.rank || ss.rank == runtime::StorageRank::kTexture) { + if (static_cast(ss.rank) <= ts.rank) { value_map[iv] = var; } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index c7ec8f23892c..12c9b5538b44 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -66,7 +66,7 @@ bool NeedRelax(const IterVar& iv, bool found_attach, if (scope.rank == StorageRank::kWarp && ts.rank == 1 && ts.dim_index == 0) { return true; } - return static_cast(scope.rank) <= ts.rank || scope.rank == StorageRank::kTexture; + return static_cast(scope.rank) <= ts.rank; } // infer storage scope, if not given From 86bb2e6e05234d08829f3ed76f2fbdc315b11cf6 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 10 May 2021 17:02:46 -0700 Subject: [PATCH 39/59] Remove TextureType. --- include/tvm/ir/type.h | 49 -------------------------------- include/tvm/ir/type_functor.h | 4 --- include/tvm/tir/op.h | 15 ---------- src/ir/type.cc | 27 ------------------ src/ir/type_functor.cc | 12 -------- src/printer/text_printer.h | 1 - src/printer/tir_text_printer.cc | 6 ---- src/printer/tvmscript_printer.cc | 7 ----- src/tir/op/op.cc | 2 +- 9 files changed, 1 insertion(+), 122 deletions(-) diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index 8d073e88b0ab..c772650809fa 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -189,55 +189,6 @@ class PointerType : public Type { TVM_DEFINE_OBJECT_REF_METHODS(PointerType, Type, PointerTypeNode); }; -/*! - * \brief Low-level texture type. 
- * - * TextureType represents type hints in the TIR to be - * passed to the final code generator. - * - * TextureType should not occur in the high-level analysis. - * - * \sa TextureType - */ -class TextureTypeNode : public TypeNode { - public: - /*! - * \brief The base type of the texture. - */ - Type element_type; - - void VisitAttrs(AttrVisitor* v) { v->Visit("element_type", &element_type); } - - bool SEqualReduce(const TextureTypeNode* other, SEqualReducer equal) const { - return equal(element_type, other->element_type); - } - - void SHashReduce(SHashReducer hash_reduce) const { hash_reduce(element_type); } - - static constexpr const char* _type_key = "TextureType"; - TVM_DECLARE_FINAL_OBJECT_INFO(TextureTypeNode, TypeNode); -}; - -/* - * \brief Managed reference to TextureTypeNode. - * \sa TextureTypeNode - */ -class TextureType : public Type { - public: - /*! - * \brief Constructor - * \param element_type The base type of the texture. - */ - TVM_DLL explicit TextureType(Type element_type); - /*! - * \brief Constructor - * \param element_type The base type of the texture. - */ - TVM_DLL explicit TextureType(runtime::DataType dtype); - - TVM_DEFINE_OBJECT_REF_METHODS(TextureType, Type, TextureTypeNode); -}; - /*! \brief Possible kinds of TypeVars. */ enum TypeKind : int { kType = 0, diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index c71051e6f61c..11bf7d4740d0 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -89,7 +89,6 @@ class TypeFunctor { virtual R VisitType_(const TypeDataNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PrimTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitType_(const PointerTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; - virtual R VisitType_(const TextureTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT; virtual R VisitTypeDefault_(const Object* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; // unreachable, written to stop compiler warning @@ -113,7 +112,6 @@ class TypeFunctor { TVM_TYPE_FUNCTOR_DISPATCH(TypeDataNode); TVM_TYPE_FUNCTOR_DISPATCH(PrimTypeNode); TVM_TYPE_FUNCTOR_DISPATCH(PointerTypeNode); - TVM_TYPE_FUNCTOR_DISPATCH(TextureTypeNode); return vtable; } }; @@ -137,7 +135,6 @@ class TVM_DLL TypeVisitor : public TypeFunctor { void VisitType_(const TypeDataNode* op) override; void VisitType_(const PrimTypeNode* op) override; void VisitType_(const PointerTypeNode* op) override; - void VisitType_(const TextureTypeNode* op) override; }; /*! @@ -158,7 +155,6 @@ class TVM_DLL TypeMutator : public TypeFunctor { Type VisitType_(const TypeDataNode* op) override; Type VisitType_(const PrimTypeNode* op) override; Type VisitType_(const PointerTypeNode* op) override; - Type VisitType_(const TextureTypeNode* op) override; private: Array MutateArray(Array arr); diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index c45a9d77a446..9cf7d0a3cd1f 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -895,21 +895,6 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) { } return false; } -/*! - * \brief Check if type is a texture handle of a runtime element type. - * \param type The type to be checked. - * \param element_type The corresponding element type. 
- * \return The check results - */ -inline bool IsTextureType(const Type& type, const DataType& element_type) { - if (!type.defined()) return false; - if (const auto* ptr_type = type.as()) { - if (const auto* prim_type = ptr_type->element_type.as()) { - return prim_type->dtype == element_type; - } - } - return false; -} /*! * \brief Make a const value with certain data type. diff --git a/src/ir/type.cc b/src/ir/type.cc index 5e0c8911c543..fe8e00329bbc 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -67,33 +67,6 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << '*'; }); -TextureType::TextureType(Type element_type) { - ObjectPtr n = make_object(); - n->element_type = std::move(element_type); - data_ = std::move(n); -} -TextureType::TextureType(runtime::DataType dtype) { - ObjectPtr n = make_object(); - n->element_type = PrimType(dtype); - data_ = std::move(n); -} - - -TVM_REGISTER_NODE_TYPE(TextureTypeNode); - -TVM_REGISTER_GLOBAL("ir.TextureType").set_body_typed([](Type element_type) { - return TextureType(element_type); -}); - -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "texture "; - p->Print(node->element_type); - p->stream << '*'; - }); - - TypeVar::TypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc index e084a82ed7be..51d5d3778c10 100644 --- a/src/ir/type_functor.cc +++ b/src/ir/type_functor.cc @@ -89,8 +89,6 @@ void TypeVisitor::VisitType_(const PrimTypeNode* op) {} void TypeVisitor::VisitType_(const PointerTypeNode* op) { this->VisitType(op->element_type); } -void TypeVisitor::VisitType_(const TextureTypeNode* op) { this->VisitType(op->element_type); } - Type TypeMutator::VisitType(const Type& t) { return t.defined() ? TypeFunctor::VisitType(t) : t; } @@ -200,16 +198,6 @@ Type TypeMutator::VisitType_(const PointerTypeNode* op) { } } -Type TypeMutator::VisitType_(const TextureTypeNode* op) { - Type element_type = VisitType(op->element_type); - - if (element_type.same_as(op->element_type)) { - return GetRef(op); - } else { - return TextureType(element_type); - } -} - // Implements bind. 
class TypeBinder : public TypeMutator { public: diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 55f68f3e36cb..0332a2d539d2 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -333,7 +333,6 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; - Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintIRModule(const IRModule& module); diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index b137ae34107d..f232994480f8 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -613,12 +613,6 @@ Doc TIRTextPrinter::VisitType_(const PointerTypeNode* node) { return doc; } -Doc TIRTextPrinter::VisitType_(const TextureTypeNode* node) { - Doc doc; - doc << "Texture(" << Print(node->element_type) << ")"; - return doc; -} - Doc TIRTextPrinter::VisitType_(const TupleTypeNode* node) { std::vector fields; for (Type field : node->fields) { diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 39852c39b82a..cc7536b48cfd 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -145,7 +145,6 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; - Doc VisitType_(const TextureTypeNode* node) override; Doc VisitType_(const TupleTypeNode* node) override; Doc PrintBody(const Stmt& body); @@ -733,12 +732,6 @@ Doc TVMScriptPrinter::VisitType_(const PointerTypeNode* node) { return doc; } -Doc TVMScriptPrinter::VisitType_(const TextureTypeNode* node) { - Doc doc; - doc << "ty.Texture[" << Print(node->element_type) << "]"; - return doc; -} - Doc TVMScriptPrinter::VisitType_(const TupleTypeNode* node) { if (node->fields.empty()) { return Doc::Text("None"); diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index d03cf22094a8..d29132450227 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -51,7 +51,7 @@ using namespace tir; runtime::DataType GetRuntimeDataType(const Type& type) { if (auto* n = type.as()) { return n->dtype; - } else if (type.as() || type.as()) { + } else if (type.as()) { return DataType::Handle(); } else if (IsVoidType(type)) { return DataType::Void(); From 3e17295194c6acfb2117c7b987940bf18360b0d4 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 19 May 2021 16:54:21 -0700 Subject: [PATCH 40/59] Remove TextureType from OpenCL codegen. 
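
With the dedicated type gone, a texture handle is represented as an ordinary PointerType whose storage_scope carries the texture tag, and codegen branches on that scope string (runtime::IsTextureStorage) to emit image2d_t instead of a pointer. A minimal sketch of constructing such a handle; the variable name is illustrative:

    // Sketch: a texture-backed buffer var after TextureType removal.
    #include <tvm/ir/type.h>
    #include <tvm/runtime/data_type.h>
    #include <tvm/tir/var.h>

    tvm::tir::Var MakeTextureHandle() {
      using namespace tvm;
      // "texture" (or "texture:weight") is what runtime::IsTextureStorage() recognizes.
      Type handle_type = PointerType(PrimType(DataType::Float(32)), "texture");
      return tir::Var("tex_buf", handle_type);
    }

Reusing PointerType this way avoids threading a second handle type through the type functors and printers, which the preceding patch already deleted from the IR.
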
--- src/target/source/codegen_opencl.cc | 33 +++++++++++++++++------------ src/tir/ir/buffer.cc | 3 +-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 87cbe8dd4d5a..5da29ca4643c 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -28,6 +28,7 @@ #include "../../runtime/opencl/opencl_module.h" #include "../../runtime/thread_storage_scope.h" +#include "../../runtime/texture.h" #include "../build_common.h" namespace tvm { @@ -77,10 +78,10 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); this->SetTextureScope(InferTextureAccess().Infer(f->body)); for (Var arg : f->params) { - if (arg->type_annotation.as()) - { + auto ptr_type = arg->type_annotation.as(); + if (ptr_type && runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) { // Storage scope qualifiers for textures are inferred - // and set prior function codegen. + // and set prior to function codegen. continue; } else if (arg.dtype().is_handle()) { @@ -211,10 +212,12 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* if (auto* ptr = type.as()) { return PrintType(ptr->dtype, os); } else if (auto* ptr = type.as()) { - PrintType(ptr->element_type, os); - os << '*'; - } else if (type.as()){ - os << "image2d_t"; + if (runtime::IsTextureStorage(std::string(ptr->storage_scope))) { + os << "image2d_t"; + } else { + PrintType(ptr->element_type, os); + os << '*'; + } } else if (IsVoidType(type)) { os << "void"; } else { @@ -278,10 +281,11 @@ void CodeGenOpenCL::PrintStorageScope(const std::string& scope, std::ostream& os } void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) { - // Only apply restrict qualifer for non-texture types - if (v->type_annotation.as() == nullptr) - { - os << ' ' << restrict_keyword_; + // Apply restrict qualifer for non-texture types only + if (auto* ptr = v->type_annotation.as()) { + if (!runtime::IsTextureStorage(std::string(ptr->storage_scope))) { + os << ' ' << restrict_keyword_; + } } } @@ -349,10 +353,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(load->index, os); os << ')'; } else if (op->op.same_as(builtin::texture2d_store())) { - auto* texture_type = op->args[0].as()->type_annotation.as(); - ICHECK(texture_type != nullptr) + auto* ptr_type = op->args[0].as()->type_annotation.as(); + ICHECK(ptr_type != nullptr) << "Texture Var's must be of PointerType"; + ICHECK(runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) << "builtin::texture2d_store() only supports storing to texture buffers"; - DataType buffer_type = texture_type->element_type.as()->dtype; + DataType buffer_type = ptr_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; } diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index cd6cffabcda4..beee377d8401 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -166,8 +166,7 @@ Buffer::Buffer(Var data, DataType dtype, Array shape, Array if (storage_dtype == DataType::Bool()) { storage_dtype = DataType::Int(8); } - ICHECK(IsPointerType(data->type_annotation, storage_dtype) || - IsTextureType(data->type_annotation, storage_dtype)) + ICHECK(IsPointerType(data->type_annotation, storage_dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype; From 
b3cdc52082aab042046d3f216f1a7909152fc16c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Wed, 19 May 2021 16:55:06 -0700 Subject: [PATCH 41/59] Remove TextureType from TIR lowering. --- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/texture_flatten.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 83e6e97a428f..daa868668c47 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -359,7 +359,7 @@ class BuiltinLower : public StmtExprMutator { Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), let->body}); - DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 740742b1a0ff..1544b68a55a4 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -124,14 +124,14 @@ class TextureFlattener : public TextureLoweringBase { return this->VisitStmt(op->body); } - Var buffer_var(op->buffer->data->name_hint, TextureType(op->buffer->dtype)); + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, PointerType(PrimType(op->buffer->dtype), String(storage_scope))); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); Stmt body = this->VisitStmt(op->body); - std::string storage_scope = GetStorageScope(op->buffer); if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; From 17b8808a648e71e4a4a6181ca3fabd23a258b052 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 22:59:36 -0700 Subject: [PATCH 42/59] Remove dependency on MergeMulMod. 
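
SimplifyOffset keeps the same row-major folding as before and simply hands each partial sum to the bound analyzer instead of MergeMulMod. The arithmetic it builds symbolically, shown on plain integers (a standalone sketch, not code from the pass):

    // Sketch: row-major flattening of an index against a shape.
    // For shape (s0, s1, s2) and index (i0, i1, i2):
    //   offset = (i0 * s1 + i1) * s2 + i2
    #include <cstddef>
    #include <vector>

    std::size_t FlattenOffset(const std::vector<std::size_t>& shape,
                              const std::vector<std::size_t>& index) {
      std::size_t offset = index.empty() ? 0 : index[0];
      for (std::size_t i = 1; i < index.size(); ++i) {
        offset = offset * shape[i] + index[i];
      }
      return offset;
    }
    // e.g. shape {2, 3, 4}, index {1, 2, 3} -> (1 * 3 + 2) * 4 + 3 = 23

The analyzer is the same IRVisitorWithAnalyzer that walks the function body first, so the partial sums are simplified under the loop-variable bounds it has already collected.
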
--- src/tir/transforms/texture_flatten.cc | 53 +++++++++++++++------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 1544b68a55a4..25770b2438e3 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -48,30 +48,15 @@ namespace tvm { namespace tir { -namespace { - using runtime::IsTextureStorage; using runtime::DefaultTextureLayoutSeparator; using runtime::ApplyTexture2DFlattening; -inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) { - PrimExpr base = make_const(DataType::Int(32), 0); - ICHECK_EQ(shape.size(), index.size()); - arith::Analyzer ana; - if (index.size() > 0) { - PrimExpr offset = index[0]; - for (size_t i = 1; i < index.size(); ++i) { - offset = MergeMulMod(&ana, offset * shape[i] + index[i]); - } - base = base + offset; - } - return base; -} -} class TextureLoweringBase : public StmtExprMutator { public: - explicit TextureLoweringBase(const Map& extern_buffer_map) { + explicit TextureLoweringBase(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { for (auto kv : extern_buffer_map) { extern_buf_.insert(kv.second); } @@ -92,6 +77,19 @@ class TextureLoweringBase : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } + inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + protected: std::string GetStorageScope(const Buffer& buffer) { @@ -106,18 +104,22 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } + // TODO: need docs // External buffer std::unordered_set extern_buf_; // Storage scope std::unordered_map storage_scope_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; }; class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; explicit TextureFlattener(const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_) - : TextureLoweringBase(extern_buffer_map), buffer_binds_(extern_buffer_binds_) {;} + const std::unordered_map& extern_buffer_binds_, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer), buffer_binds_(extern_buffer_binds_) {;} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -211,6 +213,7 @@ class TextureFlattener : public TextureLoweringBase { return args; } + // TODO: Need docs // Let binding std::unordered_map let_binding_; std::unordered_map buffer_binds_; @@ -219,8 +222,9 @@ class TextureFlattener : public TextureLoweringBase { class ExternalBufferForwarding : public TextureLoweringBase { public: - explicit ExternalBufferForwarding(const Map& extern_buffer_map) - : TextureLoweringBase(extern_buffer_map) {;} + explicit ExternalBufferForwarding(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {;} Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); @@ -307,9 +311,12 @@ class ExternalBufferForwarding : public TextureLoweringBase { PrimFunc TextureFlatten(PrimFunc func) { auto fptr = 
func.CopyOnWrite(); - ExternalBufferForwarding forward(fptr->buffer_map); + + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers())(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), &bound_analyzer)(std::move(fptr->body)); return func; } From c758eb9baf0f0109b87e9d124db89dbbcd13ad49 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 23:06:25 -0700 Subject: [PATCH 43/59] Revert "Add buffer_common.h to house buffer offset simplification routines." This reverts commit 027628259229aaee051dbf1dfbed4e63ef820544. --- src/tir/ir/buffer.cc | 240 +++++++++++++++++++++++++- src/tir/transforms/texture_flatten.cc | 2 - 2 files changed, 239 insertions(+), 3 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index beee377d8401..6a102339bcea 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -29,13 +29,17 @@ #include #include -#include "buffer_common.h" +#include +#include #include "../../arith/pattern_match.h" namespace tvm { namespace tir { +using IndexMod = tir::FloorModNode; +using IndexDiv = tir::FloorDivNode; + Array SimplifyArray(arith::Analyzer* ana, Array array) { for (size_t i = 0; i < array.size(); ++i) { array.Set(i, ana->Simplify(array[i])); @@ -50,6 +54,240 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } +namespace { +// Split the given expression w.r.t the add operator +inline std::vector ExprSplitAddition(const PrimExpr& expr) { + using namespace tir; + std::vector ret; + std::stack split_buffer; + split_buffer.push(&expr); + while (!split_buffer.empty()) { + const PrimExpr* top_ele = split_buffer.top(); + split_buffer.pop(); + auto expr_add_match = top_ele->as(); + if (expr_add_match) { + split_buffer.push(&expr_add_match->b); + split_buffer.push(&expr_add_match->a); + } else { + ret.emplace_back(top_ele); + } + } + return ret; +} + +// Searches for the following types of expr: +// mult_expr = (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// mod_l_expr = c +// mod_r_expr = k1 * k2 * ... * ki +// If it can be optimized, returns (true, (a1 + a2 + ... + aj) * kt * ... * ki + c) +// Currently the we will not search the add/mult combinations exhaustively +// as it will take too much computation. +inline std::pair MergeMulModInner(const PrimExpr& mult_expr, + const PrimExpr& mod_l_expr, + const PrimExpr& mod_r_expr) { + using namespace tir; + const MulNode* mult_ptr = mult_expr.as(); + if (!mult_ptr) return std::make_pair(false, PrimExpr()); + PrimExpr mult_outer = mult_ptr->b; + const PrimExpr* inner = &(mult_ptr->a); + // 1. Calculate the outer multiplier + while (true) { + mult_ptr = inner->as(); + if (mult_ptr) { + inner = &(mult_ptr->a); + mult_outer = mult_ptr->b * mult_outer; + } else { + break; + } + } + // 2. Search for the pattern c / (...) * (...) + c % (...) + // We match the search element with Add, Mul and Div. + // If Add is found, we need to continue our search for the rhs + // If Mult is found, we will expand the inner multiplication factor + // If Div is found, we will go on testing whether lhs matches the lhs of mod expr + // and returns the optimization result. 
+ const PrimExpr* search_ptr = inner; + PrimExpr mult_inner; // The inner multiplication factor + PrimExpr no_opt_sum; // Sum of the exprs that cannot be optimized + tir::ExprDeepEqual expr_equal; + + while (true) { + auto inner_div_ptr = search_ptr->as(); + auto inner_mult_ptr = search_ptr->as(); + auto inner_add_ptr = search_ptr->as(); + if (!inner_div_ptr && !inner_mult_ptr && !inner_add_ptr) { + return std::make_pair(false, PrimExpr()); + } else if (inner_div_ptr) { + PrimExpr overall_mult = mult_inner.get() ? mult_inner * mult_outer : mult_outer; + if (expr_equal(overall_mult, inner_div_ptr->b) && expr_equal(overall_mult, mod_r_expr) && + expr_equal(inner_div_ptr->a, mod_l_expr)) { + // Found! + PrimExpr ret = no_opt_sum.get() ? no_opt_sum * mult_outer + mod_l_expr : mod_l_expr; + return std::make_pair(true, ret); + } else { + return std::make_pair(false, PrimExpr()); + } + } else if (inner_mult_ptr) { + mult_inner = mult_inner.get() ? inner_mult_ptr->b * mult_inner : inner_mult_ptr->b; + search_ptr = &(inner_mult_ptr->a); + } else if (inner_add_ptr) { + if (mult_inner.get()) { + return std::make_pair(false, PrimExpr()); + } + no_opt_sum = no_opt_sum.get() ? no_opt_sum + inner_add_ptr->a : inner_add_ptr->a; + search_ptr = &(inner_add_ptr->b); + } else { + LOG(FATAL) << "Unexpected search result!"; + break; + } + } + return std::make_pair(false, PrimExpr()); +} + +// Insert the elements into the corresponding mult_exprs and mod_exprs. +// If the element is found to match Mul, it will be pushed to the mult_exprs. +// If the element it found to match Mod, it will be pused to the mod_exprs. +// Otherwise, the elements will be added to the no_opt_sum variable +inline void MergeMulModInsertElements(const std::vector& eles, + std::list* mult_exprs, + std::list >* mod_exprs, + PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { + using namespace tir; + *has_mult = false; + *has_mod = false; + for (const PrimExpr* ele : eles) { + auto mod_ptr = ele->as(); + auto mult_ptr = ele->as(); + if (mod_ptr) { + *has_mod = true; + mod_exprs->emplace_back(std::make_pair(std::move(mod_ptr->a), std::move(mod_ptr->b))); + } else if (mult_ptr) { + *has_mult = true; + mult_exprs->emplace_back(*ele); + } else { + *no_opt_sum = no_opt_sum->get() ? *no_opt_sum + *ele : *ele; + } + } +} + +// Searches for this types of expr: +// (a1 + a2 + ... + aj + c / (k1 * k2 * ... * ki) * k1 * ... * kt-1 ) * kt * ... * ki +// + c % (k1 * k2 * ... * ki) +// and simplifies to (a1 + a2 + ... + aj) * kt * ... * ki + c +// The search will be performed repeatively until no pattern is found. +// Return: a pair with (false, Expr()) if cannot be optimized. +// a pair with (true, optimized_expr) if can be optimized +inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { + using namespace tir; + // 1. Prepare the lists. + // We store two lists, a list that contain all the elements that match Mul and + // a list that contain all the elements that match Mod. + // The elements in the Mod will be used to match against the elements in Mul. + // The result will then be split and pushed back to these two lists. + PrimExpr simplified_base = analyzer->Simplify(base); + std::vector eles = ExprSplitAddition(simplified_base); + std::list mult_exprs; + std::list > mod_exprs; + PrimExpr no_opt_sum; + bool has_mult; + bool has_mod; + MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); + bool find_opt = false; + std::list >::iterator search_mod_it = mod_exprs.begin(); + // 2. 
Exhaustive Search + while (search_mod_it != mod_exprs.end()) { + std::list::iterator mult_it = mult_exprs.begin(); + bool inner_find_opt = false; + while (mult_it != mult_exprs.end()) { + std::pair ret = + MergeMulModInner(*mult_it, search_mod_it->first, search_mod_it->second); + if (ret.first) { + inner_find_opt = true; + auto temp_mod_it = search_mod_it; + ++search_mod_it; + mod_exprs.erase(temp_mod_it); + mult_exprs.erase(mult_it); + std::vector ret_eles = ExprSplitAddition(ret.second); + MergeMulModInsertElements(ret_eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, + &has_mod); + if (has_mult) { + search_mod_it = mod_exprs.begin(); + } else if (has_mod && search_mod_it == mod_exprs.end()) { + search_mod_it--; + } + break; + } else { + ++mult_it; + } + } + find_opt = find_opt || inner_find_opt; + if (!inner_find_opt) { + ++search_mod_it; + } + } + if (!find_opt) { + return simplified_base; + } + for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; + } + for (std::list >::iterator it = mod_exprs.begin(); + it != mod_exprs.end(); ++it) { + no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) + : indexmod(it->first, it->second); + } + return no_opt_sum; +} + +// The buffer offset in convention of number of elements of +// original data ignoring number of lanes. +// We also perform optimization to simplify the indexing expression. +inline PrimExpr ElemOffset(const BufferNode* n, Array index) { + PrimExpr base = n->elem_offset; + arith::Analyzer ana; + if (n->strides.size() == 0) { + // Scalar case + if (n->shape.size() == 0 && index.size() == 1) { + auto is_int = index[0].as(); + ICHECK(is_int && is_int->value == 0); + base = base + index[0]; + } else { + ICHECK_EQ(n->shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = MergeMulMod(&ana, offset * n->shape[i] + index[i]); + } + base = base + offset; + } + } + } else { + ICHECK_EQ(n->strides.size(), index.size()); + if (is_zero(base)) { + base = MergeMulMod(&ana, index[0] * n->strides[0]); + } else { + base = MergeMulMod(&ana, base + index[0] * n->strides[0]); + } + for (size_t i = 1; i < index.size(); ++i) { + base = MergeMulMod(&ana, base + index[i] * n->strides[i]); + } + } + return base; +} + +inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataType dtype) { + PrimExpr offset = ElemOffset(n, index); + if (n->dtype.lanes() != 1) { + offset = offset * make_const(offset.dtype(), dtype.lanes()); + } + if (dtype.lanes() != 1) { + return tir::Ramp(offset, make_const(offset.dtype(), 1), dtype.lanes()); + } else { + return offset; + } +} +} + PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 25770b2438e3..e62d736bd80f 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -42,8 +42,6 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" -#include "../ir/buffer_common.h" -#include "arg_binder.h" #include "ir_utils.h" namespace tvm { From a794abdd0a5ed6975f0e9f0e17ea7036ff85523d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Fri, 18 Jun 2021 23:08:57 -0700 Subject: [PATCH 44/59] Prune include list --- 
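For context on the routine restored in the previous patch: in its simplest case MergeMulMod looks for flattened-index expressions of the shape (a + c // k) * k + c % k and rewrites them to a * k + c, which is why dropping it from texture flattening requires the analyzer-based SimplifyOffset instead. A quick numeric sanity check of that identity in Python (helper name is made up; floor division assumed, matching FloorDiv/FloorMod semantics):

    def merged(a, c, k):
        unsimplified = (a + c // k) * k + c % k   # form produced by flattening
        simplified = a * k + c                    # form MergeMulMod recovers
        return unsimplified, simplified

    for a, c, k in [(3, 7, 4), (0, 11, 5), (2, 9, 3)]:
        u, s = merged(a, c, k)
        assert u == s
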
src/tir/transforms/texture_flatten.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index e62d736bd80f..85058b9c2112 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -22,27 +22,18 @@ * \brief Flattens texture from multi-dimensional array to 2D buffer access */ -#include -#include #include -#include #include -#include -#include #include #include -#include #include -#include #include #include -#include #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" -#include "ir_utils.h" namespace tvm { namespace tir { From 11fe640529537b4a263b9ae7c6a444002c6ff09c Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:40:19 -0700 Subject: [PATCH 45/59] Add more documentation to texture flattening. --- src/tir/transforms/texture_flatten.cc | 37 +++++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 85058b9c2112..0e571b732090 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -19,7 +19,8 @@ /*! * \file texture_flatten.cc - * \brief Flattens texture from multi-dimensional array to 2D buffer access + * \brief Flattens texture storage from multi-dimensional array + * to 2D (width, height) buffer access */ #include @@ -93,15 +94,17 @@ class TextureLoweringBase : public StmtExprMutator { return storage_scope; } - // TODO: need docs - // External buffer + // Set of all external input and output buffers std::unordered_set extern_buf_; - // Storage scope + // Map to track the storage scope of buffer realization and the + // buffer directly. std::unordered_map storage_scope_; // Bound analzer IRVisitorWithAnalyzer* bound_analyzer_; }; +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. 
class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; @@ -123,18 +126,19 @@ class TextureFlattener : public TextureLoweringBase { op = stmt.as(); Stmt body = this->VisitStmt(op->body); + // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); - ICHECK(vec_length == 4 || vec_length == 1) << "FCD of texture must be vector of length 1 or 4 (RGBA)"; + ICHECK(vec_length == 4 || vec_length == 1) << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; - struct Shape { + struct ShapeFromRange { const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); - auto texture = ApplyTexture2DFlattening(Shape{op->bounds}, op->bounds.size(), axis); + auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } @@ -202,13 +206,19 @@ class TextureFlattener : public TextureLoweringBase { return args; } - // TODO: Need docs - // Let binding + // Bindings to new texture vars with texture pointer scope std::unordered_map let_binding_; + // Bindings from realized buffers to external buffers when the memory transfer + // to the realized buffer can be cancelled std::unordered_map buffer_binds_; }; - +// Populate bindings from internal buffers to external ones of the same scope +// when it can be proven that the intermediate buffer access is identical +// to the external access. This can allow for cache_read/write cancellation +// when the external buffers are identical to the realized ones. Currently doesn't +// support forwarding external buffers when the realized buffer is conditionally +// loaded due to padding and other possible access modifying expressions. class ExternalBufferForwarding : public TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, @@ -291,16 +301,21 @@ class ExternalBufferForwarding : public TextureLoweringBase { } private: + // List of realize_attrs used to mark the last valid attr stmt to use when rewriting + // the AST to remove any unecessary buffer realization. std::deque realize_attrs_; + // Set of buffers which are identical to external buffers and are copied into. std::unordered_set extern_buffer_copy_; + // Binding from internal identical realized buffer and external buffer. std::unordered_map buffer_map_; + // Active set of loads on external buffers contained in the scope of a buffer + // realize node. std::vector> external_loads_; }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); - IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); From 89d832fd1c74a9f362eca3efe0969acc31fa066e Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:48:37 -0700 Subject: [PATCH 46/59] Add TextureFlatten transform to refactored tvm lower API. 
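The convention documented above maps an Nd realize region whose innermost extent is 1 or 4 (the RGBA lane) onto a 2d texture: dimensions before the separator axis fold into the texture height, and the remaining dimensions, excluding the vector lane, fold into the width. A rough Python model of that shape computation, assuming the separator axis is given explicitly and that leading dimensions map to height (the authoritative rules live in runtime/texture.h):

    def texture2d_shape(shape, axis):
        # shape[-1] is the RGBA vector lane and is not flattened.
        assert shape[-1] in (1, 4)
        height = 1
        for extent in shape[:axis]:
            height *= extent
        width = 1
        for extent in shape[axis:-1]:
            width *= extent
        return width, height

    # e.g. a (2, 8, 16, 16, 4) region split after the first two axes
    print(texture2d_shape((2, 8, 16, 16, 4), 2))  # -> (256, 16)

The next hunks register the pass itself and insert it into phase 1 of the lowering pipeline just before StorageFlatten, so texture realizations are rewritten before ordinary buffers are flattened.
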
--- include/tvm/tir/transform.h | 9 +++++++++ src/driver/driver_api.cc | 1 + 2 files changed, 10 insertions(+) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index d1308fe0059e..0ce29fa9da16 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -437,6 +437,15 @@ TVM_DLL Pass LowerMatchBuffer(); */ TVM_DLL Pass FlattenBuffer(); +/* + * \brief Flatten the multi-dimensional read/write + * to two dimensional texture Load/Store and realize + * texture buffer allocations. + * + * \return The Pass + */ +TVM_DLL Pass TextureFlatten(); + /*! * A pass to merge multiple TIR-level dynamic shared memory allocations into one */ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index d6af9936ca40..2759c8503393 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -215,6 +215,7 @@ Array CreatePassList(bool disable_loop_partition) { // PHASE 1 pass_list.push_back(tir::transform::InjectPrefetch()); + pass_list.push_back(tir::transform::TextureFlatten()); pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers)); pass_list.push_back(tir::transform::LowerInitBlock()); pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation()); From 349eb512bf203cd37dd61bf72474ec5f893ba4ab Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:57:17 -0700 Subject: [PATCH 47/59] Apply clang formatting. --- src/target/source/codegen_opencl.cc | 40 +++++++++------------ src/target/source/codegen_opencl.h | 10 +++--- src/tir/ir/buffer.cc | 2 +- src/tir/transforms/lower_tvm_builtin.cc | 31 ++++++++-------- src/tir/transforms/texture_flatten.cc | 47 ++++++++++++++----------- 5 files changed, 63 insertions(+), 67 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5da29ca4643c..379851b0d8f4 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -27,15 +27,15 @@ #include #include "../../runtime/opencl/opencl_module.h" -#include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" +#include "../../runtime/thread_storage_scope.h" #include "../build_common.h" namespace tvm { namespace codegen { class InferTextureAccess : public StmtExprVisitor { -public: + public: static constexpr const uint8_t read_access = 1; static constexpr const uint8_t write_access = 2; @@ -46,11 +46,9 @@ class InferTextureAccess : public StmtExprVisitor { for (auto& texture : var_access_map_) { if (texture.second == read_access) { storage_scope_qualifiers.insert({texture.first, "texture_read"}); - } - else if (texture.second == write_access) { + } else if (texture.second == write_access) { storage_scope_qualifiers.insert({texture.first, "texture_write"}); - } - else if (texture.second == (read_access | write_access)) { + } else if (texture.second == (read_access | write_access)) { storage_scope_qualifiers.insert({texture.first, ""}); } } @@ -67,11 +65,10 @@ class InferTextureAccess : public StmtExprVisitor { StmtExprVisitor::VisitExpr_(op); } -private: + private: std::unordered_map var_access_map_; }; - CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { @@ -83,8 +80,7 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { // Storage scope qualifiers for textures are inferred // and set prior to function codegen. 
continue; - } - else if (arg.dtype().is_handle()) { + } else if (arg.dtype().is_handle()) { alloc_storage_scope_[arg.get()] = "global"; } } @@ -313,8 +309,7 @@ void CodeGenOpenCL::VisitStmt_(const StoreNode* op) { // intermediate local unless the buffer allocation is a // single element selected from the texture read. auto it = allocation_size_.find(op->buffer_var.get()); - if (it != allocation_size_.end() && it->second == 1) - { + if (it != allocation_size_.end() && it->second == 1) { need_texture_ssa_ = true; } } @@ -334,7 +329,8 @@ void CodeGenOpenCL::VisitExpr_(const CastNode* op, std::ostream& os) { } void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { - allocation_size_.insert({op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); + allocation_size_.insert( + {op->buffer_var.get(), op->constant_allocation_size() * op->dtype.lanes()}); CodeGenC::VisitStmt_(op); } @@ -360,8 +356,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { DataType buffer_type = ptr_type->element_type.as()->dtype; if (buffer_type.is_float16()) { os << "write_imageh("; - } - else if (buffer_type.is_float()) { + } else if (buffer_type.is_float()) { os << "write_imagef("; } else { LOG(FATAL) << "Unsupported type: " << buffer_type @@ -380,8 +375,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { std::stringstream ss; if (op->dtype.is_float16()) { ss << "read_imageh("; - } - else if (op->dtype.is_float()) { + } else if (op->dtype.is_float()) { ss << "read_imagef("; } else { LOG(FATAL) << "Unsupported type: " << op->dtype @@ -397,11 +391,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ss << "))"; // Only use local SSA if texture is not already being stored - if (need_texture_ssa_) - { + if (need_texture_ssa_) { std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); - if (op->args.back().as()) - { + if (op->args.back().as()) { os << rhs; } else { os << "(("; @@ -450,9 +442,9 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N } } -void CodeGenOpenCL::SetTextureScope(const std::unordered_map& scope) { // NOLINT(*) - for (auto& texture : scope) - { +void CodeGenOpenCL::SetTextureScope( + const std::unordered_map& scope) { // NOLINT(*) + for (auto& texture : scope) { alloc_storage_scope_.insert(texture); } } diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index a456fdd94f5f..4c57a84ebeaf 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -51,11 +51,10 @@ class CodeGenOpenCL final : public CodeGenC { const std::string& value) final; // NOLINT(*) // the address of load/store void PrintVecAddr(const VarNode* buffer, DataType t, PrimExpr base, - std::ostream& os); // NOLINT(*) - void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) - std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) - void SetTextureScope(const std::unordered_map&); // NOLINT(*) - + std::ostream& os); // NOLINT(*) + void PrintRestrict(const Var& v, std::ostream& os) final; // NOLINT(*) + std::string CastFromTo(std::string value, DataType from, DataType target); // NOLINT(*) + void SetTextureScope(const std::unordered_map&); // NOLINT(*) // overload visitor void VisitStmt_(const AllocateNode* op) final; // NOLINT(*) @@ -65,7 +64,6 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* 
op) final; // NOLINT(*) - private: // whether enable fp16 and fp64 extension bool enable_fp16_{false}; diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 6a102339bcea..5293c7f1fee5 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -286,7 +286,7 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } -} +} // namespace PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index daa868668c47..83f35b150b24 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -356,28 +356,27 @@ class BuiltinLower : public StmtExprMutator { ICHECK(device_id_.defined()) << "Unknown device id in current IR"; Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); - Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), - throw_last_error), - let->body}); - DataType dtype = let->var->type_annotation.as()->element_type.as()->dtype; + Stmt body = SeqStmt( + {IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {let->var}), throw_last_error), + let->body}); + DataType dtype = + let->var->type_annotation.as()->element_type.as()->dtype; std::string fdevapi_prefix = "device_api."; fdevapi_prefix += runtime::DeviceName(device_type_.as()->value); - Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".AllocTexture"), - cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), - cast(DataType::UInt(64), call->args[0]), - cast(DataType::UInt(64), call->args[1]), - IntImm(DataType::Int(32), dtype.code()), - IntImm(DataType::Int(32), dtype.bits())}); + Call call_packed = + Call(let->var.dtype(), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".AllocTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), cast(DataType::UInt(64), call->args[0]), + cast(DataType::UInt(64), call->args[1]), IntImm(DataType::Int(32), dtype.code()), + IntImm(DataType::Int(32), dtype.bits())}); Stmt alloca = LetStmt(let->var, call_packed, body); - Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(), - {StringImm(fdevapi_prefix + ".FreeTexture"), - cast(DataType::Int(32), device_type_), - cast(DataType::Int(32), device_id_), let->var}); + Call free_op = + Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".FreeTexture"), cast(DataType::Int(32), device_type_), + cast(DataType::Int(32), device_id_), let->var}); Stmt free_stmt = IfThenElse(free_op != make_zero(DataType::Int(32)), throw_last_error); body = SeqStmt({alloca, free_stmt}); diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0e571b732090..0ca4668a57cf 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -33,20 +33,20 @@ #include #include "../../arith/ir_visitor_with_analyzer.h" -#include "../../runtime/thread_storage_scope.h" #include "../../runtime/texture.h" +#include "../../runtime/thread_storage_scope.h" namespace tvm { namespace tir { -using runtime::IsTextureStorage; -using runtime::DefaultTextureLayoutSeparator; using runtime::ApplyTexture2DFlattening; - +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; class TextureLoweringBase : public StmtExprMutator { public: - explicit 
TextureLoweringBase(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : bound_analyzer_{bound_analyzer} { + explicit TextureLoweringBase(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { for (auto kv : extern_buffer_map) { extern_buf_.insert(kv.second); } @@ -81,7 +81,6 @@ class TextureLoweringBase : public StmtExprMutator { } protected: - std::string GetStorageScope(const Buffer& buffer) { std::string storage_scope; auto it = storage_scope_.find(buffer.get()); @@ -108,10 +107,14 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; - explicit TextureFlattener(const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_, - IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer), buffer_binds_(extern_buffer_binds_) {;} + explicit TextureFlattener( + const Map& extern_buffer_map, + const std::unordered_map& extern_buffer_binds_, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer), + buffer_binds_(extern_buffer_binds_) { + ; + } Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -119,7 +122,8 @@ class TextureFlattener : public TextureLoweringBase { } std::string storage_scope = GetStorageScope(op->buffer); - Var buffer_var(op->buffer->data->name_hint, PointerType(PrimType(op->buffer->dtype), String(storage_scope))); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), String(storage_scope))); let_binding_.insert({op->buffer->data, buffer_var}); Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -131,14 +135,16 @@ class TextureFlattener : public TextureLoweringBase { body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); - ICHECK(vec_length == 4 || vec_length == 1) << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; + ICHECK(vec_length == 4 || vec_length == 1) + << "Inner dimension of texture must be vector of length 1 or 4 (RGBA)"; struct ShapeFromRange { const Array& bounds; PrimExpr operator[](size_t i) const { return bounds[i]->extent; } }; size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); - auto texture = ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); Array args = {texture.width, texture.height}; stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::texture2d_alloca(), args), body); } @@ -180,8 +186,7 @@ class TextureFlattener : public TextureLoweringBase { } protected: - - template + template Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { Array args; if (let_binding_.count(op->buffer->data)) { @@ -190,7 +195,7 @@ class TextureFlattener : public TextureLoweringBase { args.push_back(buffer->data); } Array row_dims, row_indices, col_dims, col_indices; - for (size_t i = 0; i < op->buffer->shape.size()-1; i++) { + for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { if (i < DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer))) { col_dims.push_back(op->buffer->shape[i]); col_indices.push_back(op->indices[i]); @@ -223,7 +228,9 @@ class ExternalBufferForwarding : public 
TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) {;} + : TextureLoweringBase(extern_buffer_map, bound_analyzer) { + ; + } Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); @@ -313,14 +320,14 @@ class ExternalBufferForwarding : public TextureLoweringBase { std::vector> external_loads_; }; - PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), + &bound_analyzer)(std::move(fptr->body)); return func; } From da3e146324e3d3c23dd5a8bcf6a74753b9b8a808 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sat, 19 Jun 2021 14:57:57 -0700 Subject: [PATCH 48/59] Blacken python APIs. --- python/tvm/tir/transform/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 4cdf7d47856e..489331b049d4 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -94,8 +94,9 @@ def StorageFlatten(cache_line_size, create_bound_attribute: bool = False): """ return _ffi_api.StorageFlatten(cache_line_size, create_bound_attribute) # type: ignore + def TextureFlatten(): - """Flatten the multi-dimensional read/write to 1D. + """Flatten the multi-dimensional read/write to 2D. Parameters From b43e0e780094b578141815bcd007fb177983899d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Sun, 20 Jun 2021 21:02:03 -0700 Subject: [PATCH 49/59] Apply cpplint changes. 
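With the Python binding updated in the previous patch, the pass can be applied to an IRModule like any other TIR transform, in addition to running automatically during lowering. A minimal usage sketch, assuming a module `mod` produced from a schedule that stages buffers in "global.texture" scope:

    import tvm
    from tvm import tir

    seq = tvm.transform.Sequential(
        [
            tir.transform.TextureFlatten(),
            tir.transform.StorageFlatten(64),
        ]
    )
    # mod = seq(mod)  # `mod` is assumed to be an IRModule with texture-scoped buffers
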
--- src/target/source/codegen_opencl.cc | 2 +- src/target/source/codegen_opencl.h | 1 + src/tir/transforms/texture_flatten.cc | 8 ++------ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 379851b0d8f4..9137b69e4bb5 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -39,7 +39,7 @@ class InferTextureAccess : public StmtExprVisitor { static constexpr const uint8_t read_access = 1; static constexpr const uint8_t write_access = 2; - explicit InferTextureAccess() {} + InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 4c57a84ebeaf..722db2b7f7b5 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -27,6 +27,7 @@ #include #include +#include #include "codegen_c.h" diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 0ca4668a57cf..53266681e1db 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -112,9 +112,7 @@ class TextureFlattener : public TextureLoweringBase { const std::unordered_map& extern_buffer_binds_, IRVisitorWithAnalyzer* bound_analyzer) : TextureLoweringBase(extern_buffer_map, bound_analyzer), - buffer_binds_(extern_buffer_binds_) { - ; - } + buffer_binds_(extern_buffer_binds_) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -228,9 +226,7 @@ class ExternalBufferForwarding : public TextureLoweringBase { public: explicit ExternalBufferForwarding(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) { - ; - } + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const AttrStmtNode* op) final { Stmt stmt = TextureLoweringBase::VisitStmt_(op); From 61387263ba227a63e6299fde616ee01f40beba69 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 19 Jul 2021 15:51:46 -0700 Subject: [PATCH 50/59] Attempt to extract storage scope from pointer scope. 
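Some background on the fallback added here: since TextureType was dropped, the storage scope travels on the buffer's data Var as part of its PointerType annotation, so when Buffer::scope is empty or plain "global" the scope can still be recovered from the pointer type. A small, illustrative Python look at where that annotation lives (constructor arguments are assumptions based on the C++ side):

    import tvm
    from tvm import tir

    ptr_ty = tvm.ir.PointerType(tvm.ir.PrimType("float32"), "global.texture")
    data = tir.Var("A_ptr", ptr_ty)
    print(data.type_annotation.storage_scope)  # -> global.texture
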
--- src/tir/ir/buffer.cc | 2 -- src/tir/transforms/lower_tvm_builtin.cc | 1 - src/tir/transforms/texture_flatten.cc | 5 +++++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 5293c7f1fee5..de2d92e0f3ab 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -54,7 +54,6 @@ Buffer decl_buffer(Array shape, DataType dtype, String name, String st Array(), PrimExpr(), name, 0, 0, kDefault, span); } -namespace { // Split the given expression w.r.t the add operator inline std::vector ExprSplitAddition(const PrimExpr& expr) { using namespace tir; @@ -286,7 +285,6 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp return offset; } } -} // namespace PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 83f35b150b24..f5a553aa0598 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -193,7 +193,6 @@ class BuiltinLower : public StmtExprMutator { return StmtExprMutator::VisitExpr_(op); } } - // call shape PrimExpr MakeShape(const CallNode* op) { // if args.size() == 0, it represents a scalar shape () diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 53266681e1db..738bfd3de2c5 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -89,6 +89,11 @@ class TextureLoweringBase : public StmtExprMutator { storage_scope = it->second; } else { storage_scope = buffer->scope; + if (storage_scope == "global" || storage_scope == "") { + if (auto* ptr = buffer->data->type_annotation.as()) { + storage_scope = ptr->storage_scope; + } + } } return storage_scope; } From bd084c06a8afddb972898d584da99981a869b9c0 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:00 -0700 Subject: [PATCH 51/59] Remove ExternalBufferForwarding (cache_read cancellation) for now. 
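The forwarding being removed here targeted schedules where an external buffer is staged through texture memory by a cache_read whose body is a pure copy, making the intermediate buffer provably identical to its source. A sketch of the kind of schedule that creates such a copy (shapes and stage names are illustrative; the cancellation of that copy is what this patch drops for now):

    from tvm import te

    A = te.placeholder((32, 64, 4), name="A", dtype="float32")
    B = te.compute(A.shape, lambda i, j, k: A[i, j, k] * 2.0, name="B")
    s = te.create_schedule(B.op)
    # Stage the input through texture memory; the A -> At copy is the one the
    # removed pass tried to elide when it was an identical, unconditional copy.
    At = s.cache_read(A, "global.texture", [B])
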
--- src/tir/transforms/texture_flatten.cc | 156 ++------------------------ 1 file changed, 7 insertions(+), 149 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 738bfd3de2c5..2a7394d01396 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -52,21 +52,6 @@ class TextureLoweringBase : public StmtExprMutator { } } - virtual Stmt VisitStmt_(const AttrStmtNode* op) { - if (op->attr_key == attr::realize_scope) { - std::string realize_scope = op->value.as()->value; - // If realize_scope for external buffer is unset, infer from buffer scope - if (realize_scope == "" && op->body->IsInstance()) { - const auto* realize = Downcast(op->body).get(); - if (extern_buf_.count(realize->buffer)) { - realize_scope = realize->buffer->scope; - } - } - storage_scope_[op->node.get()] = realize_scope; - } - return StmtExprMutator::VisitStmt_(op); - } - inline PrimExpr SimplifyOffset(const Array& shape, const Array& index) const { PrimExpr base = make_const(DataType::Int(32), 0); ICHECK_EQ(shape.size(), index.size()); @@ -82,27 +67,13 @@ class TextureLoweringBase : public StmtExprMutator { protected: std::string GetStorageScope(const Buffer& buffer) { - std::string storage_scope; - auto it = storage_scope_.find(buffer.get()); - // If buffer has a realize_scope attr return it - if (it != storage_scope_.end()) { - storage_scope = it->second; - } else { - storage_scope = buffer->scope; - if (storage_scope == "global" || storage_scope == "") { - if (auto* ptr = buffer->data->type_annotation.as()) { - storage_scope = ptr->storage_scope; - } - } - } - return storage_scope; + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; } // Set of all external input and output buffers std::unordered_set extern_buf_; - // Map to track the storage scope of buffer realization and the - // buffer directly. 
- std::unordered_map storage_scope_; // Bound analzer IRVisitorWithAnalyzer* bound_analyzer_; }; @@ -114,10 +85,8 @@ class TextureFlattener : public TextureLoweringBase { using StmtExprMutator::VisitStmt_; explicit TextureFlattener( const Map& extern_buffer_map, - const std::unordered_map& extern_buffer_binds_, IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer), - buffer_binds_(extern_buffer_binds_) {} + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { if (extern_buf_.count(op->buffer)) { @@ -172,15 +141,10 @@ class TextureFlattener : public TextureLoweringBase { PrimExpr VisitExpr_(const BufferLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - // Replace with identitcal external buffer if one exists - auto buffer = op->buffer; - if (buffer_binds_.count(op->buffer)) { - buffer = buffer_binds_[op->buffer]; - } // Lower to two dimensional access - std::string storage_scope = GetStorageScope(buffer); + std::string storage_scope = GetStorageScope(op->buffer); if (IsTextureStorage(storage_scope)) { - Array args = GetTextureAccessArgs(op, buffer); + Array args = GetTextureAccessArgs(op, op->buffer); args.push_back(op->indices.back()); expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); } @@ -216,119 +180,13 @@ class TextureFlattener : public TextureLoweringBase { // Bindings to new texture vars with texture pointer scope std::unordered_map let_binding_; - // Bindings from realized buffers to external buffers when the memory transfer - // to the realized buffer can be cancelled - std::unordered_map buffer_binds_; -}; - -// Populate bindings from internal buffers to external ones of the same scope -// when it can be proven that the intermediate buffer access is identical -// to the external access. This can allow for cache_read/write cancellation -// when the external buffers are identical to the realized ones. Currently doesn't -// support forwarding external buffers when the realized buffer is conditionally -// loaded due to padding and other possible access modifying expressions. -class ExternalBufferForwarding : public TextureLoweringBase { - public: - explicit ExternalBufferForwarding(const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) - : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} - - Stmt VisitStmt_(const AttrStmtNode* op) final { - Stmt stmt = TextureLoweringBase::VisitStmt_(op); - if (op->attr_key == attr::realize_scope) { - if (op->body->IsInstance()) { - const auto* realize = Downcast(op->body).get(); - std::string realize_scope = GetStorageScope(realize->buffer); - if (IsTextureStorage(realize_scope) && extern_buffer_copy_.count(realize->buffer)) { - return realize_attrs_.back(); - } else { - if (realize_attrs_.size()) { - realize_attrs_.pop_back(); - } - realize_attrs_.push_back(stmt); - } - return stmt; - } - } - - return stmt; - } - - Stmt VisitStmt_(const BufferStoreNode* op) final { - ICHECK_EQ(external_loads_.size(), 0) << "Found external loads bound to a different store"; - if (auto* call_node = op->value.as()) { - // Path to supporting external cache_read canceling when padding has induced - // a conditional load into the cache_read buffer. 
We may be able to elide the - // conditional completely due to hardware support for returning 0 when OOB - if (call_node->op.same_as(builtin::if_then_else())) { - external_loads_.emplace_back(); - } - } - Stmt stmt = StmtExprMutator::VisitStmt_(op); - op = stmt.as(); - - auto check_identity = [this](const BufferStoreNode* store, const BufferLoad& load) { - if (extern_buf_.count(load->buffer)) { - // If the buffer to load and the buffer to store to are both texture - // check for identical access - if (IsTextureStorage(GetStorageScope(load->buffer)) && - IsTextureStorage(GetStorageScope(store->buffer))) { - auto store_index = SimplifyOffset(store->buffer->shape, store->indices); - auto load_index = SimplifyOffset(load->buffer->shape, load->indices); - if (arith::Analyzer().CanProve(store_index == load_index)) { - extern_buffer_copy_.insert(store->buffer); - buffer_map_.insert({store->buffer, load->buffer}); - } - } - } - }; - - if (auto load_node = op->value.as()) { - check_identity(op, GetRef(load_node)); - } else if (external_loads_.size()) { - // Stored value is not a load, check for external loads collected - // when visiting the store node's value, e.g. from if_then_else - for (auto& expr : external_loads_.back()) { - check_identity(op, Downcast(expr)); - } - external_loads_.pop_back(); - } - return stmt; - } - - PrimExpr VisitExpr_(const BufferLoadNode* op) final { - PrimExpr expr = StmtExprMutator::VisitExpr_(op); - if (external_loads_.size() && extern_buf_.count(op->buffer)) { - external_loads_.back().push_back(expr); - } - return expr; - } - - const std::unordered_map& GetForwardedBuffers() { - return buffer_map_; - } - - private: - // List of realize_attrs used to mark the last valid attr stmt to use when rewriting - // the AST to remove any unecessary buffer realization. - std::deque realize_attrs_; - // Set of buffers which are identical to external buffers and are copied into. - std::unordered_set extern_buffer_copy_; - // Binding from internal identical realized buffer and external buffer. - std::unordered_map buffer_map_; - // Active set of loads on external buffers contained in the scope of a buffer - // realize node. - std::vector> external_loads_; }; PrimFunc TextureFlatten(PrimFunc func) { auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); - ExternalBufferForwarding forward(fptr->buffer_map, &bound_analyzer); - fptr->body = forward(std::move(fptr->body)); - fptr->body = TextureFlattener(fptr->buffer_map, forward.GetForwardedBuffers(), - &bound_analyzer)(std::move(fptr->body)); + fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); return func; } From d2f8bda8c44beceec814a91d498b65ed10cbbbbc Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:24 -0700 Subject: [PATCH 52/59] Apply MyPy. 
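For reference, the forwarding logic removed in the previous patch decided whether a store into a realized buffer was an identical copy of an external load by flattening both index vectors with SimplifyOffset and asking the arithmetic analyzer to prove the offsets equal. A standalone sketch of that proof step with TVM's Python analyzer (variables and the 2-d shape are made up for illustration):

    import tvm
    from tvm import tir, arith

    i = tir.Var("i", "int32")
    j = tir.Var("j", "int32")
    ana = arith.Analyzer()
    # Flattened offsets of store[i, j] and load[i, j] into a (16, 32) buffer.
    store_offset = i * 32 + j
    load_offset = i * 32 + j
    print(ana.simplify(store_offset - load_offset))  # 0 => provably identical access
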
--- python/tvm/tir/transform/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 489331b049d4..732bf0f1ca11 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -107,7 +107,7 @@ def TextureFlatten(): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.TextureFlatten() + return _ffi_api.TextureFlatten() # type: ignore def InjectCopyIntrin(pragma_key: str, fintrin): From b0ea2834f9e353f52cce03835583522a978c1bdf Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 20 Jul 2021 20:30:44 -0700 Subject: [PATCH 53/59] Clang format --- src/tir/transforms/texture_flatten.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index 2a7394d01396..d74202a3ab79 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -83,9 +83,8 @@ class TextureLoweringBase : public StmtExprMutator { class TextureFlattener : public TextureLoweringBase { public: using StmtExprMutator::VisitStmt_; - explicit TextureFlattener( - const Map& extern_buffer_map, - IRVisitorWithAnalyzer* bound_analyzer) + explicit TextureFlattener(const Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} Stmt VisitStmt_(const BufferRealizeNode* op) final { From 11027316774fe602c528a245423203c0af63f80d Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 22 Jul 2021 09:33:39 -0700 Subject: [PATCH 54/59] Only visit RealizeBuffer body for texture storage. --- src/tir/transforms/texture_flatten.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index d74202a3ab79..7dc800737944 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -99,11 +99,10 @@ class TextureFlattener : public TextureLoweringBase { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - Stmt body = this->VisitStmt(op->body); // Rewrite any buffer realizations with storage scope to 2d texture allocations if (IsTextureStorage(storage_scope)) { - body = this->VisitStmt(op->body); + Stmt body = this->VisitStmt(op->body); ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; int vec_length = static_cast(op->bounds.back()->extent.as()->value); ICHECK(vec_length == 4 || vec_length == 1) From 46ae5abd38cfdec18f8a439ba0858c35329a48ad Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 22 Jul 2021 21:14:12 -0700 Subject: [PATCH 55/59] Fix bad merge. --- src/tir/ir/buffer.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index de2d92e0f3ab..335ff19dd775 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -183,7 +183,12 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { // a list that contain all the elements that match Mod. // The elements in the Mod will be used to match against the elements in Mul. // The result will then be split and pushed back to these two lists. 
- PrimExpr simplified_base = analyzer->Simplify(base); + PrimExpr simplified_base = base; + arith::PVar x, y; + if ((floordiv(x, y) * y + floormod(x, y)).Match(simplified_base)) { + simplified_base = x.Eval(); + } + simplified_base = analyzer->Simplify(simplified_base); std::vector eles = ExprSplitAddition(simplified_base); std::list mult_exprs; std::list > mod_exprs; From 9fa362d1c7f9190a998cf86420404962c229fcc5 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Mon, 26 Jul 2021 14:31:29 -0700 Subject: [PATCH 56/59] Utilize OpenCL preprocessor to switch between sampler-less and codegen provided sampler for texture reads depending on whether the opencl runtime is 2.0 compliant. --- src/target/source/codegen_opencl.cc | 48 ++++++++++++++++++++++++++--- src/target/source/codegen_opencl.h | 7 +++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 9137b69e4bb5..8d760a07e032 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -69,7 +69,10 @@ class InferTextureAccess : public StmtExprVisitor { std::unordered_map var_access_map_; }; -CodeGenOpenCL::CodeGenOpenCL() { restrict_keyword_ = "restrict"; } +CodeGenOpenCL::CodeGenOpenCL() { + // Set OpenCL specific restrict keyword + restrict_keyword_ = "restrict"; +} void CodeGenOpenCL::InitFuncState(const PrimFunc& f) { CodeGenC::InitFuncState(f); @@ -117,6 +120,40 @@ std::string CodeGenOpenCL::Finish() { decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n\n"; } + + // Enable OpenCL 1.2 sampler-less texture reads, but utilize + // provided sampler in OpenCL 2.0. + if (enable_compliant_texture_reads_) { + // TODO(csullivan, lunderberg): Extend device attribute querying to support remote devices + // generically through the device API such that a target can be created from a specific device's + // attributes and utilized during codegen. Potential generlization of #8127 (c02cafb) for remote + // devices. + // + // E.g. Only provide an image sampler when the local or remote device supports OpenCL 2.0, + // see below for context. + // + // For backwards compatibility with OpenCL 1.2, sampler-less read_image calls are used. + // By default in sampler-less read_image calls OpenCL defaults to + // sampler_ = "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; + // See section 6.12.14.3 Built-in Image Sampler-less Read Functions in the OpenCL 1.2 + // specification. For OpenCL 2.0 it can be preferable to use, + // sampler_ = "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; + // For now we rely on OpenCL preprocessor directives to utilize the correct behavior + // depending on the OpenCL version detected at OpenCL compile time. 
+ decl_stream << "#ifdef __OPENCL_VERSION__\n" + << "#if __OPENCL_VERSION__ == CL_VERSION_2_0\n" + << "#define READ_IMAGEH(image, sampler, coord) " + << "read_imageh(image, sampler, coord)\n" + << "#define READ_IMAGEF(image, sampler, coord) " + << "read_imagef(image, sampler, coord)\n" + << "#else\n" + << "#define READ_IMAGEH(image, sampler, coord) " + << "read_imageh(image, coord)\n" + << "#define READ_IMAGEF(image, sampler, coord) " + << "read_imagef(image, coord)\n" + << "#endif\n" + << "#endif\n\n"; + } return CodeGenC::Finish(); } @@ -372,11 +409,12 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[3], os); os << ")"; } else if (op->op.same_as(builtin::texture2d_load())) { + enable_compliant_texture_reads_ = true; std::stringstream ss; if (op->dtype.is_float16()) { - ss << "read_imageh("; + ss << "READ_IMAGEH("; } else if (op->dtype.is_float()) { - ss << "read_imagef("; + ss << "READ_IMAGEF("; } else { LOG(FATAL) << "Unsupported type: " << op->dtype << ", currently only float and half are supported for image2d OpenCL codegen."; @@ -384,11 +422,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { this->PrintExpr(op->args[0], ss); ss << ", "; ss << "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST, "; - ss << "(int2)("; + ss << "((int2)("; this->PrintExpr(op->args[1], ss); ss << ", "; this->PrintExpr(op->args[2], ss); - ss << "))"; + ss << ")))"; // Only use local SSA if texture is not already being stored if (need_texture_ssa_) { diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 722db2b7f7b5..a8c293c03056 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -71,7 +71,14 @@ class CodeGenOpenCL final : public CodeGenC { bool enable_fp64_{false}; // Whether to enable atomics extension. bool enable_atomics_{false}; + // Whether to enable sampler or sampler-less texture reads, + // where the choice depends on the OpenCL version used. + bool enable_compliant_texture_reads_{false}; + // Key to disable use of texture SSA in certain scenarios. For example, + // when loaded value is stored directly to a user declared l-value buffer bool need_texture_ssa_{true}; + // Mapping from buffer to allocation size. + // Useful to track when a scalar store of a vectorized texture load is required. std::unordered_map allocation_size_; }; From 213492ce6994b5a341afaef6fe78c09b99903f20 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Thu, 5 Aug 2021 16:10:45 -0700 Subject: [PATCH 57/59] Add texture codegen test example. --- .../test_target_texture_codegen_opencl.py | 1767 +++++++++++++++++ 1 file changed, 1767 insertions(+) create mode 100644 tests/python/unittest/test_target_texture_codegen_opencl.py diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py new file mode 100644 index 000000000000..b155d56f1346 --- /dev/null +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -0,0 +1,1767 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys + +import numpy as np +import pytest + +import tvm +from tvm import autotvm +from tvm import te +from tvm.topi import testing +from tvm.topi.utils import get_const_tuple, simplify +from tvm.topi import nn + + +def compute(shape): + X = te.placeholder(shape, name="X", dtype="float32") + Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y") + return X, Y + +def schedule(X, Y): + s = te.create_schedule(Y.op) + #Xt = s.cache_read(X, "texture", [Y]) + #Xt = s.cache_read(X, "global", [Y]) + Xt = s.cache_read(X, "global.texture", [Y]) + + # copy to texture stage + x, y, c = s[Xt].op.axis + s[Xt].bind(x, te.thread_axis("blockIdx.x")) + s[Xt].bind(y, te.thread_axis("threadIdx.x")) + s[Xt].vectorize(c) + + # the compute stage + x, y, c = s[Y].op.axis + xo, yo, xi, yi = s[Y].tile(x, y, 4, 4) + s[Y].bind(xo, te.thread_axis("blockIdx.x")) + s[Y].bind(yo, te.thread_axis("threadIdx.x")) + s[Y].vectorize(c) + return s + +def compute5d(shape): + X = te.placeholder(shape, name="X", dtype="float32") + Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y") + return X, Y + +def schedule5d(X, Y): + s = te.create_schedule(Y.op) + Xt = s.cache_read(X, "global.texture", [Y]) + + # copy to texture stage + a, b, c, d, e = s[Xt].op.axis + abc = s[Xt].fuse(a, b, c) + s[Xt].bind(abc, te.thread_axis("blockIdx.x")) + s[Xt].bind(d, te.thread_axis("threadIdx.x")) + s[Xt].vectorize(e) + + # the compute stage + a, b, c, d, e = s[Y].op.axis + abc = s[Y].fuse(a, b, c) + xo, yo, xi, yi = s[Y].tile(abc, d, 4, 4) + s[Y].bind(xo, te.thread_axis("blockIdx.x")) + s[Y].bind(yo, te.thread_axis("threadIdx.x")) + s[Y].vectorize(e) + return s + +def compute_matmul(shape): + A = te.placeholder(shape, name="A", dtype="float32") + B = te.placeholder(shape, name="B", dtype="float32") + k = te.reduce_axis((0, shape[1]), name="k") + C = te.compute( + (shape[0]*shape[2], shape[0]*shape[2]), + lambda i, j: te.sum( + A[i//shape[2], k, i%shape[2]].astype("float32") * B[j//shape[2], k, j%shape[2]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): + _io, _k, _ii = s[stage].op.axis + s[stage].vectorize(_ii) + s[stage].bind(_io, bx) + s[stage].bind(_k, tx) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _x, _y = s[Cl].op.axis + s[Cl].reorder(_k, _x, _y) + s[Cl].unroll(_x) + s[Cl].vectorize(_y) + + if local: + s[Al].compute_at(s[Cl], _k) + 
s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _k) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + + +def compute_matmul_inner(shape): + A = te.placeholder(shape, name="A", dtype="float32") + B = te.placeholder(shape, name="B", dtype="float32") + k = te.reduce_axis((0, shape[1]*shape[2]), name="k") + # (M, K) x (N, K) + # (32, 256) x (32, 256) + # (32, 64, 4) x (32, 64, 4) + C = te.compute( + (shape[0], shape[0]), + lambda i, j: te.sum( + A[i, k//shape[2], k%shape[2]].astype("float32") * B[j, k//shape[2], k%shape[2]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul_inner(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): + _i, _ko, _ki = s[stage].op.axis + s[stage].vectorize(_ki) + s[stage].bind(_i, bx) + s[stage].bind(_ko, tx) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _x, _y = s[Cl].op.axis + s[Cl].reorder(_x, _y, _k) + s[Cl].unroll(_x) + # TODO(csullivan): consider whether the below error is worth resolving + # s[Cl].vectorize(_y) # error + + if local: + s[Al].compute_at(s[Cl], _x) + s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _x) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + +def compute_matmul_vector_accumulator(shapeA, shapeB): + # A x B + # (K/4, M, K%4) x (K, N/4, N%4) = (M, N) + # (32, 64, 4) x (128, 16, 4) = (64, 64) + A = te.placeholder(shapeA, name="A", dtype="float32") + B = te.placeholder(shapeB, name="B", dtype="float32") + k = te.reduce_axis((0, shapeB[0]), name="k") + C = te.compute( + (shapeA[1], shapeB[1]*shapeB[2]), + lambda i, j: te.sum( + A[k//shapeA[-1], i, k%shapeA[-1]].astype("float32") * B[k, j//shapeB[-1], j%shapeB[-1]].astype("float32"), axis=[k] + ), + name="Compute_MatMul", + ) + return A, B, C + +def schedule_matmul_vector_accumulator(A, B, C, local=False): + s = te.create_schedule(C.op) + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + if local: + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + _y, _x, _v = s[stage].op.axis + # TODO(csullivan): removing this vectorize results in numerical errors, autovectorize + s[stage].vectorize(_v) + s[stage].bind(_y, te.thread_axis("blockIdx.x")) + s[stage].bind(_x, te.thread_axis("threadIdx.x")) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _a, _b = s[Cl].op.axis + _ko, _ki = s[Cl].split(_k, factor=4) + s[Cl].reorder(_ko, _a, _ki, _b) + s[Cl].unroll(_ki) + s[Cl].unroll(_a) + s[Cl].vectorize(_b) + + if local: + s[Al].compute_at(s[Cl], _a) + _aa, _ka, _ba = s[Al].op.axis 
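+        # _ba spans the vector lane (last axis) of the cached texture tile;
+        # keeping it vectorized lets the local copy read whole texels.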
+ # TODO(csullivan)[BEFORE PR]: removing this vectorize command causes a crash. This needs to be autovectorized. + s[Al].vectorize(_ba) + s[Bl].compute_at(s[Cl], _ko) + _ab, _kb, _bb = s[Bl].op.axis + s[Bl].vectorize(_bb) + s[Bl].unroll(_ab) + + return s + +def schedule_matmul_vector_accumulator_autotvm(A, B, C): + s = te.create_schedule(C.op) + cfg = autotvm.get_config() + + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + _y, _x, _v = s[stage].op.axis + s[stage].vectorize(_v) + s[stage].bind(_y, te.thread_axis("blockIdx.x")) + s[stage].bind(_x, te.thread_axis("threadIdx.x")) + + copy_to_texture(At) + copy_to_texture(Bt) + + # copy to global stage + _i, _j = s[C].op.axis + xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) + s[C].unroll(xi) + s[C].vectorize(yi) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(yo, te.thread_axis("threadIdx.x")) + + # the compute stage + s[Cl].compute_at(s[C], yo) + (_k,) = Cl.op.reduce_axis + _a, _b = s[Cl].op.axis + _ko, _ki = s[Cl].split(_k, factor=4) + + s[Cl].reorder(_ko, _a, _ki, _b) + cfg.define_knob("unroll", [0, 1]) + if cfg["unroll"] == 1: + s[Cl].unroll(_ki) + s[Cl].unroll(_a) + s[Cl].vectorize(_b) + + s[Al].compute_at(s[Cl], _a) + _aa, _ka, _ba = s[Al].op.axis + s[Al].vectorize(_ba) + s[Bl].compute_at(s[Cl], _ko) + _ab, _kb, _bb = s[Bl].op.axis + s[Bl].vectorize(_bb) + s[Bl].unroll(_ab) + + + return s + +def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): + # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + c = te.reduce_axis((0, input_shape[1]), name="C") + c4 = te.reduce_axis((0, input_shape[-1]), name="c4") + kh = te.reduce_axis((0, filter_shape[0]), name="kh") + kw = te.reduce_axis((0, filter_shape[1]), name="kw") + conv = te.compute( + (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]), + lambda n, ko, i, j, ki: te.sum( + data[n, c, i, j, c4].astype("float32") * filt[kh, kw, c*input_shape[-1] + c4, ko, ki].astype("float32"), axis=[kh, kw, c, c4] + ), + #name="Compute_conv2d_1x1_NCHWc_RSCKk", + name = "conv2d_1x1" + ) + return data, filt, conv + +def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv): + # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + # outputs: + s = te.create_schedule(conv.op) + A, B, C = data, filt, conv + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) + copy_to_texture(Bt) + + _n, _ko, _h, _w, _ki = s[C].op.axis + s[C].vectorize(_ki) + s[C].bind(_n, te.thread_axis("blockIdx.x")) + s[C].bind(_ko, te.thread_axis("threadIdx.x")) + + s[Cl].compute_at(s[C], _w) + _nl, _kol, _hl, _wl, _kil = s[Cl].op.axis + _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis + _clo, _cli = s[Cl].split(_cl, factor=4) + s[Cl].reorder(_clo, _cli, _cl4, _kil) + s[Cl].unroll(_cli) + s[Cl].unroll(_cl4) + s[Cl].vectorize(_kil) + + 
s[Al].compute_at(s[Cl], _cli) + s[Al].vectorize(s[Al].op.axis[-1]) + s[Bl].compute_at(s[Cl], _kwl) + s[Bl].vectorize(s[Bl].op.axis[-1]) + + return s + + +def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): + # input_shape = [W, C, H, N, c] -> [W, C, H*N, c] + # filter_shape = [C, R, S, K, k] -> [C, R*S*K, k] + # output_shape: [WK, HN, k] -> [W, K, H, N, k] + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + + packed_data = te.compute( + (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]), + lambda i, j, k, l: data[i, j, k//input_shape[3], k%input_shape[3], l], + name = "packed_data" + ) + + # Logical transformation of Nd -> 3d tensor + # CRSKk -> C|RSK|k + # r = rsk // SK + # sk = rsk % SK + # s = sk // K == (rsk % SK) // K == (rsk // K) % S + # k = sk % K == (rsk % SK) % K == rsk % K + packed_filter = te.compute( + (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]), + lambda i, j, k: filt[i, j//(filter_shape[3] * filter_shape[2]), (j//filter_shape[3])%filter_shape[2], j%filter_shape[3], k], + name = "packed_filter" + ) + + c = te.reduce_axis((0, input_shape[1]), name="C") + c4 = te.reduce_axis((0, input_shape[-1]), name="c4") + r = te.reduce_axis((0, filter_shape[1]), name="r") + s = te.reduce_axis((0, filter_shape[2]), name="s") + + conv = te.compute( + (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]), + lambda w, ko, h, n, ki: te.sum( + packed_data[w, c, h * input_shape[3] + n, c4].astype("float32") + * + packed_filter[c*input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki].astype("float32"), axis=[r, s, c, c4] + ), + name = "conv2d_1x1" + ) + return data, filt, packed_data, packed_filter, conv + +def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv): + # data: [W, C, H*N, c] + # filter: [C, R*S*K, k] + # output: [W, K, H, N, k] + + # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] + # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + + # data: (56, 128//4, 56*1, 4) = (56, 32, 56, 4) + # filt: (128, 1*1*128//4, 4) = (128, 32, 4) + # conv: (56, 32, 56, 1, 4) + + s = te.create_schedule(conv.op) + cfg = autotvm.get_config() + + s[packed_data].compute_inline() + s[packed_filter].compute_inline() + A, B, C = packed_data, packed_filter, conv + At = s.cache_read(A, "global.texture", [C]) + Bt = s.cache_read(B, "global.texture", [C]) + Al = s.cache_read(At, "local", [C]) + Bl = s.cache_read(Bt, "local", [C]) + Cl = s.cache_write(C, "local") + + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) + copy_to_texture(Bt) + + _w, _ko, _h, _n, _ki = s[C].op.axis + kernel_scope, _n = s[C].split(_n, nparts=1) + + cfg.define_split("tile_f", _ko, num_outputs=4) + cfg.define_split("tile_w", _w, num_outputs=4) + cfg.define_split("tile_h", _h, num_outputs=4) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + + bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko) + bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w) + bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h) + s[C].reorder(bh, _n, vh, th, hi) + bhn = s[C].fuse(bh, _n) + + s[C].bind(bk, te.thread_axis("blockIdx.z")) + s[C].bind(bhn, 
te.thread_axis("blockIdx.y")) + s[C].bind(bw, te.thread_axis("blockIdx.x")) + s[C].bind(vk, te.thread_axis("vthread")) + s[C].bind(vh, te.thread_axis("vthread")) + s[C].bind(vw, te.thread_axis("vthread")) + s[C].bind(tk, te.thread_axis("threadIdx.z")) + s[C].bind(th, te.thread_axis("threadIdx.y")) + s[C].bind(tw, te.thread_axis("threadIdx.x")) + s[C].reorder(bw, bk, bhn, vw, vk, vh, tw, tk, th, ki, hi, wi, _ki) + s[C].vectorize(_ki) + + # TODO(csullivan): Try uneven workgroup split + # _wo, _wi = s[C].split(_w, factor=4) + # #_hno, _hni = s[C].split(_hn, factor=8) + # #s[C].reorder(_wo, _wi, _ko, _hno, _hni, _ki) + # s[C].reorder(_wo, _ko, _hn, _ki, _wi) + # s[C].unroll(_wi) + + # # mace: + # # const int out_ch_blk = get_global_id(0); + # # const int out_w_blk = get_global_id(1); + # # const int out_hb = get_global_id(2); + + # bx = te.thread_axis("blockIdx.x") + # by = te.thread_axis("blockIdx.y") + # bz = te.thread_axis("blockIdx.z") + # s[C].bind(_ko, bx) + # s[C].bind(_wo, by) + # s[C].bind(_hn, bz) + + #s[Cl].compute_at(s[C], _hn) + s[Cl].compute_at(s[C], th) + + _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis + _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis + + cfg.define_split("tile_c", _cl, num_outputs=2) + cfg.define_split("tile_kh", _khl, num_outputs=2) + cfg.define_split("tile_kw", _kwl, num_outputs=2) + + + + _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl) + _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl) + _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl) + #s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl) + #s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) + # s[Cl].reorder(_cl, _cl4, _kil, _wl) + s[Cl].unroll(_cl4) + s[Cl].unroll(_wl) + s[Cl].vectorize(_kil) + + + _wla, _cla, _hnla, _cl4a = s[Al].op.axis + s[Al].compute_at(s[Cl], _cli) + s[Al].vectorize(_cl4a) + s[Al].unroll(_wla) + + _clb, _rskolb, _kilb = s[Bl].op.axis + s[Bl].compute_at(s[Cl], _cli) + s[Bl].vectorize(_kilb) + s[Bl].unroll(_clb) + + s[C].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + WO, K, HO, N, K4 = get_const_tuple(C.shape) + RSC, _, _ = get_const_tuple(B.shape) + cfg.add_flop(2 * N * K * K4 * HO * WO * RSC) + + return s + +def compute_conv2d_cuda_NCHW_KCRS(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHW layout. 
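+
+    A plain NCHW direct convolution with no texture packing, kept in this
+    test alongside the NCHWc texture variants for comparison.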
+ + Parameters + ---------- + Input : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Filter : tvm.te.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel, in_height, in_width = Input.shape + num_filter, channel, kernel_h, kernel_w = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + out_channel = num_filter + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left] + pad_after = [0, 0, pad_down, pad_right] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rc = te.reduce_axis((0, in_channel), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + return te.compute( + (batch, out_channel, out_height, out_width), + lambda nn, ff, yy, xx: te.sum( + temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype( + out_dtype + ) + * Filter[ff, rc, ry, rx].astype(out_dtype), + axis=[rc, ry, rx], + ), + tag="conv2d_nchw", + ) + + +def schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv): + """schedule optimized for batch size = 1""" + + ##### space definition begin ##### + n, f, y, x = s[conv].op.axis + rc, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_f", f, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + AA = s.cache_read(pad_data, "shared", [OL]) + WW = s.cache_read(kernel, "shared", [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + 
s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, f, y, x = s[OL].op.axis + rc, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rc"].apply(s, OL, rc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + + # cooperative fetching + for load in [AA, WW]: + n, f, y, x = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, KH, KW, CI = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) + + +def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHWc layout. 
""" + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape + num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rcc = te.reduce_axis((0, in_channel_chunk), name="rc") + rcb = te.reduce_axis((0, in_channel_block), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + # NCHWc x KCRSk + # texture: NCH|W|c + # texture: K|CRS|k + # c = crs//RS + # rs = crs % RS + # r = rs // W == (crs // S) % R + # s = rs % W == crs % S + Filter_tx = te.compute( + (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), + lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], + name = "packed_filter" + ) + return te.compute( + (batch, num_filter_chunk, out_height, out_width, num_filter_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb].astype( + out_dtype + ) + * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb].astype(out_dtype), + axis=[rcc, rcb, ry, rx], + ), + tag="conv2d_nchwc_kcrsk_texture", + ) + +def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): + """schedule optimized for batch size = 1""" + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + rcc, rcb, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, 
te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + rcc, rcb, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + # TODO(csullivan): check position of rcb + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[OL].vectorize(fb) + s[OL].unroll(rcb) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + _, ICKHKW, _ = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) + + +def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): + """Convolution operator in NCHWc layout. 
""" + + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape + num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + rcc = te.reduce_axis((0, in_channel_chunk), name="rc") + rcb = te.reduce_axis((0, in_channel_block), name="rc") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + # NCHWc x KCRSk + # texture: NCH|W|c + # texture: K|CRS|k + # c = crs//RS + # rs = crs % RS + # r = rs // W == (crs // S) % R + # s = rs % W == crs % S + Filter_tx = te.compute( + (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), + lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], + name = "packed_filter" + ) + conv = te.compute( + (batch, num_filter_chunk, out_height, out_width, num_filter_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] + * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb]).astype(out_dtype), + axis=[rcc, rcb, ry, rx], + ), + tag="conv2d_nchwc_kcrsk_texture", + ) + output = te.compute(conv.shape, lambda n,fc,y,x,fb: conv[n,fc,y,x,fb].astype("float32")) + return output + + + +def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): + """schedule optimized for batch size = 1""" + + conv = output.op.input_tensors[0] + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + rcc, rcb, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rcc", rcc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused 
= s[stage].fuse(*axes[:-1]) + block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + rcc, rcb, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + # TODO(csullivan): check position of rcb + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) + s[OL].vectorize(fb) + s[OL].unroll(rcb) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + _, ICKHKW, _ = get_const_tuple(kernel.shape) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) + + + +def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): + """Depthwise convolution operator in NCHWc layout. 
""" + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, channel_chunk, in_height, in_width, channel_block = Input.shape + _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape + + # compute the output shape + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + out_channel_chunk = simplify(channel_chunk * channel_multiplier) + out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) + # compute graph + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") + + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + + + # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c] + # NCHWc x CMRS + # texture: NCH|W|c + # texture: C|MRS|c + # output: N + # m = mrs//RS + # rs = mrs % RS + # r = rs // W == (mrs // S) % R + # s = rs % W == mrs % S + Filter_tx = te.compute( + (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block), + lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb], + name = "packed_filter" + ) + + conv = te.compute( + (batch, out_channel_chunk, out_height, out_width, channel_block), + lambda nn, ffc, yy, xx, ffb: te.sum( + (temp[nn, ffc//channel_multiplier, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffb] + * Filter_tx[ffc//channel_multiplier, ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, ffb]).astype(out_dtype), + axis=[ry, rx], + ), + tag="depthwise_conv2d_nchwc_kcrsk_texture", + ) + return te.compute(conv.shape, lambda n,ffc,y,x,ffb: conv[n,ffc,y,x,ffb].astype("float32")) + + + +def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): + """schedule optimized for batch size = 1""" + + conv = output.op.input_tensors[0] + + ##### space definition begin ##### + n, fc, y, x, fb = s[conv].op.axis + ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_fc", fc, num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + pad_data, flattened_kernel = s[conv].op.input_tensors + kernel = s[flattened_kernel].op.input_tensors[0] + s[flattened_kernel].compute_inline() + + s[pad_data].compute_inline() + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + kernel = flattened_kernel + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, "local") + else: + output = s.outputs[0].output(0) + s[conv].set_scope("local") + OL = conv + + # create cache stage + AT = s.cache_read(pad_data, "global.texture", [OL]) + WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): + axes = s[stage].op.axis + fused = s[stage].fuse(*axes[:-1]) + 
block, thread = s[stage].split(fused, factor=32) + s[stage].vectorize(axes[-1]) + s[stage].bind(block, te.thread_axis("blockIdx.x")) + s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) + copy_to_texture(WT) + + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + + # tile and bind spatial axes + n, fc, y, x, fb = s[output].op.axis + + kernel_scope, n = s[output].split(n, nparts=1) + + bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + bf = s[output].fuse(n, bf) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) + s[output].vectorize(fb) + + s[OL].compute_at(s[output], tx) + + # tile reduction axes + n, fc, y, x, fb = s[OL].op.axis + + ry, rx = s[OL].op.reduce_axis + ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) + rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) + + s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) + s[OL].vectorize(fb) + #s[OL].unroll() + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + # cooperative fetching + for load in [AA, WW]: + if load == WW: + n, fyx, v = s[load].op.axis + fused = s[load].fuse(n, fyx) + else: + n, f, y, x, v = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + s[load].vectorize(v) + + # unroll + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + + N, OCC, OH, OW, OCB = get_const_tuple(output.shape) + ICC, MKHKW, ICB = get_const_tuple(kernel.shape) + M = (OCC * OCB) // (ICC * ICB) + KHKW = MKHKW // M + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) + + +def compute_conv2d_NCHWc_KCRSk( + cfg, data, kernel, stride, padding, dilation, out_dtype=None +): + """Convolution operator for 'conv2d_NCHWc_KCRSk'. + + Parameters + ---------- + data : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] + + kernel : tvm.te.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, + filter_width, num_filter_block, in_channel_block] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + dilation : int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + out_dtype : str + The output type. This is used for mixed precision. 
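+
+    cfg : autotvm config
+        Schedule configuration; in this compute definition it is only used
+        to record the FLOP count of the workload via cfg.add_flop.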
+ + Returns + ------- + Output : tvm.te.Tensor + 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] + """ + if out_dtype is None: + out_dtype = data.dtype + ic_block_factor = 4 + oc_block_factor = 4 + + pre_computed = len(kernel.shape) == 5 + if not pre_computed: + batch, channels, height, width = get_const_tuple(data.shape) + out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + + assert ( + channels % ic_block_factor == 0 + ), "Number of input channels must divide {}".format(ic_block_factor) + assert ( + out_channels % oc_block_factor == 0 + ), "Number of output channels must divide {}".format(oc_block_factor) + + packed_data = te.compute( + (batch, channels // ic_block_factor, height, width, ic_block_factor), + lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], + name="packed_data", + ) + packed_kernel = te.compute( + ( + out_channels // oc_block_factor, + in_channels, + kernel_h, + kernel_w, + oc_block_factor + ), + lambda oc_chunk, ic, kh, kw, oc_block: kernel[ + oc_chunk * oc_block_factor + oc_block, ic, kh, kw + ], + name="packed_kernel", + ) + else: + packed_data = data + packed_kernel = kernel + + batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape) + oc_chunk, _, kernel_h, kernel_w, oc_block = get_const_tuple(packed_kernel.shape) + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + # pad the input data + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) + pad_before = [0, 0, pad_top, pad_left, 0] + pad_after = [0, 0, pad_down, pad_right, 0] + pad_data = nn.pad(packed_data, pad_before, pad_after, name="pad_data") + + # compute the output shape + out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 + + oshape = (batch, oc_chunk, out_height, out_width, oc_block) + + icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") + icb = te.reduce_axis((0, ic_block_factor), name="ic_block") + kh = te.reduce_axis((0, kernel_h), name="kh") + kw = te.reduce_axis((0, kernel_w), name="kw") + + conv = te.compute( + oshape, + lambda n, occ, oh, ow, ocb: te.sum( + pad_data[ + n, + icc, + oh * stride_h + kh * dilation_h, + ow * stride_w + kw * dilation_w, + icb, + ] + * packed_kernel[occ, icc * ic_block + icb, kh, kw, ocb], + axis=[icc, kh, kw, icb], + ), + ) + + # Type conversion + output = te.compute( + oshape, lambda *index: conv(*index).astype(out_dtype), tag="conv2d_NCHWc_KCRSk" + ) + + num_flop = ( + batch + * oc_chunk + * oc_block + * out_height + * out_width + * ic_chunk + * ic_block + * kernel_h + * kernel_w + * 2 + ) + cfg.add_flop(num_flop) + + return output + + +def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): + """Schedule conv2d NCHWc template""" + + conv = output.op.input_tensors[0] + packed_data, packed_kernel = conv.op.input_tensors + + if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: + pad_data = packed_data + packed_data = pad_data.op.input_tensors[0] + else: + pad_data = packed_data + + # if autotvm.GLOBAL_SCOPE.in_tuning: + # # skip this part during tuning to make records accurate + # # this part will be pre-computed during NNVM's pre-compute optimization pass + # s[packed_data].pragma(s[packed_data].op.axis[0], 
"debug_skip_region") + # s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") + # else: + # if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel": + # # data and kernel are not pre-computed, schedule layout transform here + # schedule_injective_from_existing(s, packed_data) + # schedule_injective_from_existing(s, packed_kernel) + + if pad_data != packed_data: + s[pad_data].compute_inline() + + # create cache stage + AA = s.cache_read(pad_data, "shared", [conv]) + WW = s.cache_read(packed_kernel, "shared", [conv]) + + s[conv].set_scope("local") + + # handle bias + if output.op not in s.outputs: + s[output].compute_inline() + output = s.outputs[0].output(0) + + oc_chunk = nn.get_const_int(output.shape[1]) + # tile and bind spatial axes + n, f, y, x, c = s[output].op.axis + cfg.define_split("tile_n", n, num_outputs=4) + cfg.define_split("tile_f", cfg.axis(oc_chunk), num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + + # this is the scope to attach global config inside this kernel + kernel_scope, n = s[output].split(n, nparts=1) + + s[output].bind(n, te.thread_axis("blockIdx.z")) + bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + #s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf + if cfg["fuse_yx"].val: + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) + tyx = s[output].fuse(ty, tx) + s[output].bind(tyx, te.thread_axis("threadIdx.x")) + s[conv].compute_at(s[output], tyx) + + # number of threads + n_tz = cfg["tile_n"].size[2] + n_ty = cfg["tile_f"].size[2] + n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] + else: + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[conv].compute_at(s[output], tx) + + # number of threads + n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] + n_ty = cfg["tile_y"].size[2] + n_tx = cfg["tile_x"].size[2] + + # tile and bind reduction axes + n, f, y, x, c = s[conv].op.axis + rc, ry, rx, rc_block = s[conv].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) + cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) + cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) + rco, rci = cfg["tile_rc"].apply(s, conv, rc) + ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) + rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) + + s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) + #_, rc_block = s[conv].split(rc_block, factor=4) + #s[conv].tensorize(rc_block, _dp4a) + + s[AA].compute_at(s[conv], rxo) + s[WW].compute_at(s[conv], rxo) + + # cooperative fetching + for load in [AA, WW]: + fcd = s[load].op.axis[-1] + #fcd_outer, fcd = s[load].split(fcd, factor=4) 
+ s[load].vectorize(fcd) + #fused = s[load].op.axis[:-1] + [fcd_outer] + fused = s[load].op.axis[:-1] + fused = s[load].fuse(*fused) + + fused, tx = s[load].split(fused, factor=n_tx) + fused, ty = s[load].split(fused, factor=n_ty) + fused, tz = s[load].split(fused, factor=n_tz) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + return s + + +#@autotvm.template("matmul_vector_accumulator_tune") +def matmul_vector_acc_template(shapeA, shapeB): + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator_autotvm(*placeholders) + return s, placeholders + +#@autotvm.template("conv2d_1x1_NCHWc_RSCKk_tune") +def conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape): + placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape) + s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders) + return s, placeholders + +#@autotvm.template("conv2d_1x1_WCHNc_CRSKk_tune") +def conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape): + placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape) + s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders) + return s, (placeholders[0], placeholders[1], placeholders[-1]) + +#@autotvm.template("conv2d_cuda_NCHW_KCRS_tune") +def conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + conv = compute_conv2d_cuda_NCHW_KCRS(data, filt, [1,1], [0,0], [0,0], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [conv]]) + schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv) + return s, (data, filt, conv) + +#@autotvm.template("conv2d_cuda_NCHWc_KCRSk_tune") +def conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape): + cfg = autotvm.get_config() + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_conv2d_NCHWc_KCRSk(cfg, data, filt, [1,1], [0,0], [0,0], "float32") + s = te.create_schedule([x.op for x in [output]]) + s = schedule_conv2d_NCHWc_KCRSk(cfg, s, output) + return s, (data, filt, output) + +def conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + conv = compute_conv2d_NCHWc_KCRSk_tx(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [conv]]) + schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv) + return s, (data, filt, conv) + +def conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = autotvm.get_config() + s = te.create_schedule([x.op for x in [output]]) + schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + return s, (data, filt, output) + +def depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape): + data = te.placeholder(input_shape, name="data", dtype="float32") + filt = te.placeholder(filter_shape, name="filter", dtype="float32") + output = compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + cfg = 
autotvm.get_config() + s = te.create_schedule([x.op for x in [output]]) + schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + return s, (data, filt, output) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune") +def conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune2") +def conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune") +def conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) + +#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune2") +def conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape): + return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) + +#@autotvm.template("depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune") +def depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape): + return depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape) + +def ref_convolution(data, kernel, stride, pad): + import mxnet as mx + groups = 1 + kernel_size = (kernel.shape[2], kernel.shape[3]) + num_filter = kernel.shape[0] + ref_res = mx.nd.Convolution( + data=mx.nd.array(data), + weight=mx.nd.array(kernel), + bias=None, + no_bias=True, + kernel=kernel_size, + stride=stride, + pad=pad, + num_filter=num_filter, + num_group=groups, + ) + return ref_res.asnumpy() + +def ref_depthwise_convolution(data, kernel, stride, pad): + import mxnet as mx + groups = kernel.shape[0] + kernel_size = (kernel.shape[2], kernel.shape[3]) + num_filter = kernel.shape[0] + multiplier = kernel.shape[1] + ref_res = mx.nd.Convolution( + data=mx.nd.array(data), + weight=mx.nd.array(kernel), + bias=None, + no_bias=True, + kernel=kernel_size, + stride=stride, + pad=pad, + num_filter=num_filter, + num_group=groups, + ) + return ref_res.asnumpy() + +def validate(workload, target, dev): + s, placeholders = workload() + func = tvm.driver.build(s, [*placeholders], target=target, name="TestFunction") + + args_tvm = [] + args_np = [] + for var in placeholders[:-1]: + var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype) + args_np.append(var_np) + args_tvm.append(tvm.nd.array(var_np, dev)) + args_tvm.append(tvm.nd.array(np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev)) + func(*args_tvm) + + if "plus_one" in workload.__name__: + np_result = args_np[0] + 1.0; + elif "matmul" in workload.__name__: + if 'inner' in workload.__name__: + np_result = np.matmul(args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)) + elif 'accum' in workload.__name__: + np_result = np.matmul(args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)) + else: + np_result = np.matmul(args_np[0].transpose((0, 2, 1)).reshape(128, 64), args_np[1].transpose(1, 0, 2).reshape(64,128)) + elif "conv2d_1x1_NCHWc_RSCKk_tune" in workload.__name__: + vec_length = args_np[1].shape[-1] + # nchwc -> nchw + args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + # rsckk -> rsck -> kcrs + args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], 
args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 2, 0, 1)) + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # nkhw -> nkhwk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + elif "conv2d_1x1_WCHNc_CRSKk_tune" in workload.__name__: + vec_length = args_np[1].shape[-1] + # wchnc -> nchw + args_np[0] = args_np[0].transpose((3, 1, 4, 2, 0)).reshape(args_np[0].shape[3], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[0]) + # crskk -> crsk -> kcrs + args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 0, 1, 2)) + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # nkhw -> nkkhw -> wkhnk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(4, 1, 3, 0, 2) + elif "NCHW_KCRS" in workload.__name__: + np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + elif "NCHWc_KCRSk" in workload.__name__: + vec_length = args_np[1].shape[-1] + # nchwc -> nchw + args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + # kcrsk/cmrsc -> kcrs/cmrs + args_np[1] = args_np[1].transpose((0, 4, 1, 2, 3)).reshape(args_np[1].shape[0] * args_np[1].shape[4], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]) + if "depthwise" in workload.__name__: + #np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") + np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], []) + else: + #np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + np_result = ref_convolution(args_np[0], args_np[1], [], []) + # nkhw -> nkhwk + np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2) + +def verify_plus_one_rank3(): + shape =(32, 32, 4) + placeholders = compute(shape) + s = schedule(*placeholders) + return s, placeholders + +def verify_matmul(): + shape = (32, 64, 4) + placeholders = compute_matmul(shape) + s = schedule_matmul(*placeholders) + return s, placeholders + +def verify_matmul_with_local(): + shape = (32, 64, 4) + placeholders = compute_matmul(shape) + s = schedule_matmul(*placeholders, local=True) + return s, placeholders + +def verify_matmul_inner(): + shape = (32, 64, 4) + placeholders = compute_matmul_inner(shape) + s = schedule_matmul_inner(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator(): + shapeA, shapeB = (32, 64, 4), (128, 16, 4) + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator_with_local(): + shapeA, shapeB = (32, 64, 4), (128, 16, 4) + placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) + s = schedule_matmul_vector_accumulator(*placeholders, local=True) + return s, placeholders + +def verify_plus_one_rank5(): + shape =(32, 2, 4, 4, 4) + placeholders = compute5d(shape) + s = schedule5d(*placeholders) + return s, placeholders + +def verify_matmul_vector_accumulator_tune(): + shapeA, shapeB = 
(32, 64, 4), (128, 16, 4) + return matmul_vector_acc_template(shapeA, shapeB) + +def verify_conv2d_1x1_NCHWc_RSCKk_tune(): + # mobilenetv1 1x1 conv2d + input_shape, filter_shape = (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) + return conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape) + +def verify_conv2d_1x1_WCHNc_CRSKk_tune(): + input_shape, filter_shape = (56, 128//4, 56, 1, 4), (128, 1, 1, 128//4, 4) + return conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape) + +def verify_conv2d_cuda_NCHW_KCRS_tune(): + # NCHW, KCRS + input_shape, filter_shape = (1, 128, 56, 56), (128, 128, 1, 1) + return conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape) + +def verify_conv2d_cuda_NCHWc_KCRSk_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_tune2(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (256, 512, 1, 1, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (128, 512, 3, 3, 4) + # input_shape, filter_shape = (1, 128, 7, 7, 4), (512, 512, 1, 1, 4) + return conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape) + +def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune2(): + # NCHWc, KCRSk + input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) + return conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape) + +def verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(): + # deeplabv3 + # [1, 144, 129, 129], [144, 1, 3, 3] + # [1, 96, 257, 257], [96, 1, 3, 3] + # [N, C, H, W], [K, 1, R, S] + # [N, C/4, H, W, 4c], [C/4, 1, R, S, 4c] + input_shape, filter_shape = (1, 96//4, 257, 257, 4), (96//4, 1, 3, 3, 4) + return depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape) + +@tvm.testing.parametrize_targets("opencl") +def test_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(target, dev): + validate(verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune, target, dev) + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) From 7d76707e15c40ab2e6eb771df0e49e65d5c318ca Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 15:23:41 -0700 Subject: [PATCH 58/59] Refactor tests to use pytest parameterization. Blacken tests. 
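
Compute and schedule helpers are renamed after the workloads they implement
(e.g. compute -> compute_plus_one_rank3, schedule5d -> schedule_plus_one_rank5)
and shape/target handling moves to tvm.testing-style parameterization. A rough
sketch of the parameterized pattern being adopted is shown below; the fixture
name and tolerances are illustrative only, and the test module's existing
numpy/tvm imports are assumed:

    shape = tvm.testing.parameter((32, 32, 4))

    @tvm.testing.parametrize_targets("opencl")
    def test_plus_one_rank3(target, dev, shape):
        X, Y = compute_plus_one_rank3(shape)
        s = schedule_plus_one_rank3(X, Y)
        func = tvm.driver.build(s, [X, Y], target=target)
        x_np = np.random.uniform(size=shape).astype("float32")
        x = tvm.nd.array(x_np, dev)
        y = tvm.nd.array(np.zeros(shape, dtype="float32"), dev)
        func(x, y)
        np.testing.assert_allclose(y.asnumpy(), x_np + 1.0, rtol=1e-2, atol=1e-2)
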
--- .../test_target_texture_codegen_opencl.py | 1025 ++++++----------- 1 file changed, 329 insertions(+), 696 deletions(-) diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py index b155d56f1346..03944c85ade5 100644 --- a/tests/python/unittest/test_target_texture_codegen_opencl.py +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -27,15 +27,16 @@ from tvm.topi import nn -def compute(shape): +def compute_plus_one_rank3(shape): X = te.placeholder(shape, name="X", dtype="float32") Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y") return X, Y -def schedule(X, Y): + +def schedule_plus_one_rank3(X, Y): s = te.create_schedule(Y.op) - #Xt = s.cache_read(X, "texture", [Y]) - #Xt = s.cache_read(X, "global", [Y]) + # Xt = s.cache_read(X, "texture", [Y]) + # Xt = s.cache_read(X, "global", [Y]) Xt = s.cache_read(X, "global.texture", [Y]) # copy to texture stage @@ -52,12 +53,14 @@ def schedule(X, Y): s[Y].vectorize(c) return s -def compute5d(shape): + +def compute_plus_one_rank5(shape): X = te.placeholder(shape, name="X", dtype="float32") Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y") return X, Y -def schedule5d(X, Y): + +def schedule_plus_one_rank5(X, Y): s = te.create_schedule(Y.op) Xt = s.cache_read(X, "global.texture", [Y]) @@ -77,19 +80,23 @@ def schedule5d(X, Y): s[Y].vectorize(e) return s + def compute_matmul(shape): A = te.placeholder(shape, name="A", dtype="float32") B = te.placeholder(shape, name="B", dtype="float32") k = te.reduce_axis((0, shape[1]), name="k") C = te.compute( - (shape[0]*shape[2], shape[0]*shape[2]), + (shape[0] * shape[2], shape[0] * shape[2]), lambda i, j: te.sum( - A[i//shape[2], k, i%shape[2]].astype("float32") * B[j//shape[2], k, j%shape[2]].astype("float32"), axis=[k] + A[i // shape[2], k, i % shape[2]].astype("float32") + * B[j // shape[2], k, j % shape[2]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -101,6 +108,7 @@ def schedule_matmul(A, B, C, local=False): bx = te.thread_axis("blockIdx.x") tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): _io, _k, _ii = s[stage].op.axis s[stage].vectorize(_ii) @@ -138,19 +146,22 @@ def copy_to_texture(stage): def compute_matmul_inner(shape): A = te.placeholder(shape, name="A", dtype="float32") B = te.placeholder(shape, name="B", dtype="float32") - k = te.reduce_axis((0, shape[1]*shape[2]), name="k") + k = te.reduce_axis((0, shape[1] * shape[2]), name="k") # (M, K) x (N, K) # (32, 256) x (32, 256) # (32, 64, 4) x (32, 64, 4) C = te.compute( (shape[0], shape[0]), lambda i, j: te.sum( - A[i, k//shape[2], k%shape[2]].astype("float32") * B[j, k//shape[2], k%shape[2]].astype("float32"), axis=[k] + A[i, k // shape[2], k % shape[2]].astype("float32") + * B[j, k // shape[2], k % shape[2]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul_inner(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -162,6 +173,7 @@ def schedule_matmul_inner(A, B, C, local=False): bx = te.thread_axis("blockIdx.x") tx = te.thread_axis("threadIdx.x") + def copy_to_texture(stage): _i, _ko, _ki = s[stage].op.axis s[stage].vectorize(_ki) @@ -196,6 +208,7 @@ def copy_to_texture(stage): return s + def compute_matmul_vector_accumulator(shapeA, shapeB): # A x 
B # (K/4, M, K%4) x (K, N/4, N%4) = (M, N) @@ -204,14 +217,17 @@ def compute_matmul_vector_accumulator(shapeA, shapeB): B = te.placeholder(shapeB, name="B", dtype="float32") k = te.reduce_axis((0, shapeB[0]), name="k") C = te.compute( - (shapeA[1], shapeB[1]*shapeB[2]), + (shapeA[1], shapeB[1] * shapeB[2]), lambda i, j: te.sum( - A[k//shapeA[-1], i, k%shapeA[-1]].astype("float32") * B[k, j//shapeB[-1], j%shapeB[-1]].astype("float32"), axis=[k] + A[k // shapeA[-1], i, k % shapeA[-1]].astype("float32") + * B[k, j // shapeB[-1], j % shapeB[-1]].astype("float32"), + axis=[k], ), name="Compute_MatMul", ) return A, B, C + def schedule_matmul_vector_accumulator(A, B, C, local=False): s = te.create_schedule(C.op) At = s.cache_read(A, "global.texture", [C]) @@ -261,56 +277,6 @@ def copy_to_texture(stage): return s -def schedule_matmul_vector_accumulator_autotvm(A, B, C): - s = te.create_schedule(C.op) - cfg = autotvm.get_config() - - At = s.cache_read(A, "global.texture", [C]) - Bt = s.cache_read(B, "global.texture", [C]) - Al = s.cache_read(At, "local", [C]) - Bl = s.cache_read(Bt, "local", [C]) - Cl = s.cache_write(C, "local") - - def copy_to_texture(stage): - _y, _x, _v = s[stage].op.axis - s[stage].vectorize(_v) - s[stage].bind(_y, te.thread_axis("blockIdx.x")) - s[stage].bind(_x, te.thread_axis("threadIdx.x")) - - copy_to_texture(At) - copy_to_texture(Bt) - - # copy to global stage - _i, _j = s[C].op.axis - xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) - s[C].unroll(xi) - s[C].vectorize(yi) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - s[C].bind(yo, te.thread_axis("threadIdx.x")) - - # the compute stage - s[Cl].compute_at(s[C], yo) - (_k,) = Cl.op.reduce_axis - _a, _b = s[Cl].op.axis - _ko, _ki = s[Cl].split(_k, factor=4) - - s[Cl].reorder(_ko, _a, _ki, _b) - cfg.define_knob("unroll", [0, 1]) - if cfg["unroll"] == 1: - s[Cl].unroll(_ki) - s[Cl].unroll(_a) - s[Cl].vectorize(_b) - - s[Al].compute_at(s[Cl], _a) - _aa, _ka, _ba = s[Al].op.axis - s[Al].vectorize(_ba) - s[Bl].compute_at(s[Cl], _ko) - _ab, _kb, _bb = s[Bl].op.axis - s[Bl].vectorize(_bb) - s[Bl].unroll(_ab) - - - return s def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): # conv2d( [N, C, H, W, c] , [1, 1, C, K, k] @@ -323,13 +289,16 @@ def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): conv = te.compute( (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]), lambda n, ko, i, j, ki: te.sum( - data[n, c, i, j, c4].astype("float32") * filt[kh, kw, c*input_shape[-1] + c4, ko, ki].astype("float32"), axis=[kh, kw, c, c4] + data[n, c, i, j, c4].astype("float32") + * filt[kh, kw, c * input_shape[-1] + c4, ko, ki].astype("float32"), + axis=[kh, kw, c, c4], ), - #name="Compute_conv2d_1x1_NCHWc_RSCKk", - name = "conv2d_1x1" + # name="Compute_conv2d_1x1_NCHWc_RSCKk", + name="conv2d_1x1", ) return data, filt, conv + def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv): # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) # outputs: @@ -348,6 +317,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) copy_to_texture(Bt) @@ -382,8 +352,8 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): packed_data = te.compute( (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]), - lambda i, j, k, l: data[i, j, k//input_shape[3], k%input_shape[3], l], - name = "packed_data" + lambda i, j, k, l: data[i, j, k // input_shape[3], k % 
input_shape[3], l], + name="packed_data", ) # Logical transformation of Nd -> 3d tensor @@ -394,8 +364,14 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): # k = sk % K == (rsk % SK) % K == rsk % K packed_filter = te.compute( (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]), - lambda i, j, k: filt[i, j//(filter_shape[3] * filter_shape[2]), (j//filter_shape[3])%filter_shape[2], j%filter_shape[3], k], - name = "packed_filter" + lambda i, j, k: filt[ + i, + j // (filter_shape[3] * filter_shape[2]), + (j // filter_shape[3]) % filter_shape[2], + j % filter_shape[3], + k, + ], + name="packed_filter", ) c = te.reduce_axis((0, input_shape[1]), name="C") @@ -407,13 +383,16 @@ def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]), lambda w, ko, h, n, ki: te.sum( packed_data[w, c, h * input_shape[3] + n, c4].astype("float32") - * - packed_filter[c*input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki].astype("float32"), axis=[r, s, c, c4] + * packed_filter[ + c * input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki + ].astype("float32"), + axis=[r, s, c, c4], ), - name = "conv2d_1x1" + name="conv2d_1x1", ) return data, filt, packed_data, packed_filter, conv + def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv): # data: [W, C, H*N, c] # filter: [C, R*S*K, k] @@ -445,6 +424,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(At) copy_to_texture(Bt) @@ -456,7 +436,6 @@ def copy_to_texture(stage): cfg.define_split("tile_h", _h, num_outputs=4) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko) bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w) bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h) @@ -494,7 +473,7 @@ def copy_to_texture(stage): # s[C].bind(_wo, by) # s[C].bind(_hn, bz) - #s[Cl].compute_at(s[C], _hn) + # s[Cl].compute_at(s[C], _hn) s[Cl].compute_at(s[C], th) _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis @@ -504,20 +483,17 @@ def copy_to_texture(stage): cfg.define_split("tile_kh", _khl, num_outputs=2) cfg.define_split("tile_kw", _kwl, num_outputs=2) - - _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl) _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl) _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl) - #s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + # s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl) - #s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) + # s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli) # s[Cl].reorder(_cl, _cl4, _kil, _wl) s[Cl].unroll(_cl4) s[Cl].unroll(_wl) s[Cl].vectorize(_kil) - _wla, _cla, _hnla, _cl4a = s[Al].op.axis s[Al].compute_at(s[Cl], _cli) s[Al].vectorize(_cl4a) @@ -536,165 +512,8 @@ def copy_to_texture(stage): return s -def compute_conv2d_cuda_NCHW_KCRS(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCHW layout. 
- - Parameters - ---------- - Input : tvm.te.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Filter : tvm.te.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] - - stride : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - Returns - ------- - Output : tvm.te.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - if out_dtype is None: - out_dtype = Input.dtype - assert isinstance(stride, int) or len(stride) == 2 - assert isinstance(dilation, int) or len(dilation) == 2 - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - batch, in_channel, in_height, in_width = Input.shape - num_filter, channel, kernel_h, kernel_w = Filter.shape - # compute the output shape - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( - padding, (dilated_kernel_h, dilated_kernel_w) - ) - out_channel = num_filter - out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) - # compute graph - pad_before = [0, 0, pad_top, pad_left] - pad_after = [0, 0, pad_down, pad_right] - temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") - - rc = te.reduce_axis((0, in_channel), name="rc") - ry = te.reduce_axis((0, kernel_h), name="ry") - rx = te.reduce_axis((0, kernel_w), name="rx") - return te.compute( - (batch, out_channel, out_height, out_width), - lambda nn, ff, yy, xx: te.sum( - temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype( - out_dtype - ) - * Filter[ff, rc, ry, rx].astype(out_dtype), - axis=[rc, ry, rx], - ), - tag="conv2d_nchw", - ) - - -def schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv): - """schedule optimized for batch size = 1""" - - ##### space definition begin ##### - n, f, y, x = s[conv].op.axis - rc, ry, rx = s[conv].op.reduce_axis - cfg.define_split("tile_f", f, num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - cfg.define_split("tile_rc", rc, num_outputs=2) - cfg.define_split("tile_ry", ry, num_outputs=2) - cfg.define_split("tile_rx", rx, num_outputs=2) - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - - - pad_data, kernel = s[conv].op.input_tensors - - s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - - if conv.op in s.outputs: - output = conv - OL = s.cache_write(conv, "local") - else: - output = s.outputs[0].output(0) - s[conv].set_scope("local") - OL = conv - - AA = s.cache_read(pad_data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - kernel_scope, n = s[output].split(n, nparts=1) - - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - bf = s[output].fuse(n, bf) - 
s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) - - # tile reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - rco, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) - rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - - # unroll - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - - N, CO, OH, OW = get_const_tuple(output.shape) - _, KH, KW, CI = get_const_tuple(kernel.shape) - - if isinstance(N, int): - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) - - -def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None): """Convolution operator in NCHWc layout. 
""" if out_dtype is None: @@ -739,24 +558,29 @@ def compute_conv2d_NCHWc_KCRSk_tx(Input, Filter, stride, padding, dilation, out_ # rs = crs % RS # r = rs // W == (crs // S) % R # s = rs % W == crs % S - Filter_tx = te.compute( + Filter = te.compute( (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), - lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, crs, ffb: Filter[ + ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb + ], + name="packed_filter", ) return te.compute( (batch, num_filter_chunk, out_height, out_width, num_filter_block), lambda nn, ffc, yy, xx, ffb: te.sum( - temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb].astype( - out_dtype - ) - * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb].astype(out_dtype), + temp[ + nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb + ].astype(out_dtype) + * Filter[ + ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb + ].astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc_kcrsk_texture", ) -def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): + +def schedule_conv2d_NCHWc_KCRSk(cfg, s, conv): """schedule optimized for batch size = 1""" ##### space definition begin ##### @@ -790,6 +614,7 @@ def schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -797,9 +622,13 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) + AA = s.cache_read(AT, "shared", [OL]) + WW = s.cache_read(WT, "shared", [OL]) + # tile and bind spatial axes n, fc, y, x, fb = s[output].op.axis @@ -864,7 +693,7 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) -def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): """Convolution operator in NCHWc layout. 
""" if out_dtype is None: @@ -909,26 +738,29 @@ def compute_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation # rs = crs % RS # r = rs // W == (crs // S) % R # s = rs % W == crs % S - Filter_tx = te.compute( + Filter = te.compute( (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), - lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, crs, ffb: Filter[ + ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb + ], + name="packed_filter", ) conv = te.compute( (batch, num_filter_chunk, out_height, out_width, num_filter_block), lambda nn, ffc, yy, xx, ffb: te.sum( - (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] - * Filter_tx[ffc, ((rcc * in_channel_block + rcb)*kernel_h + ry)*kernel_w + rx, ffb]).astype(out_dtype), + ( + temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] + * Filter[ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb] + ).astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc_kcrsk_texture", ) - output = te.compute(conv.shape, lambda n,fc,y,x,fb: conv[n,fc,y,x,fb].astype("float32")) + output = te.compute(conv.shape, lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32")) return output - -def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): +def schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): """schedule optimized for batch size = 1""" conv = output.op.input_tensors[0] @@ -964,6 +796,7 @@ def schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -971,6 +804,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) @@ -1042,8 +876,9 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) - -def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): +def compute_depthwise_conv2d_NCHWc_KCRSk_acc32( + Input, Filter, stride, padding, dilation, out_dtype=None +): """Depthwise convolution operator in NCHWc layout. 
""" if out_dtype is None: out_dtype = Input.dtype @@ -1080,7 +915,6 @@ def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") - # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c] # NCHWc x CMRS # texture: NCH|W|c @@ -1090,26 +924,41 @@ def compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(Input, Filter, stride, padding # rs = mrs % RS # r = rs // W == (mrs // S) % R # s = rs % W == mrs % S - Filter_tx = te.compute( + Filter = te.compute( (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block), - lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb], - name = "packed_filter" + lambda ffc, mrs, ffb: Filter[ + ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb + ], + name="packed_filter", ) conv = te.compute( (batch, out_channel_chunk, out_height, out_width, channel_block), lambda nn, ffc, yy, xx, ffb: te.sum( - (temp[nn, ffc//channel_multiplier, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffb] - * Filter_tx[ffc//channel_multiplier, ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, ffb]).astype(out_dtype), + ( + temp[ + nn, + ffc // channel_multiplier, + yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, + ffb, + ] + * Filter[ + ffc // channel_multiplier, + ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, + ffb, + ] + ).astype(out_dtype), axis=[ry, rx], ), tag="depthwise_conv2d_nchwc_kcrsk_texture", ) - return te.compute(conv.shape, lambda n,ffc,y,x,ffb: conv[n,ffc,y,x,ffb].astype("float32")) - + return te.compute( + conv.shape, lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype("float32") + ) -def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): +def schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): """schedule optimized for batch size = 1""" conv = output.op.input_tensors[0] @@ -1144,6 +993,7 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output): # create cache stage AT = s.cache_read(pad_data, "global.texture", [OL]) WT = s.cache_read(kernel, "global.texture", [OL]) + def copy_to_texture(stage): axes = s[stage].op.axis fused = s[stage].fuse(*axes[:-1]) @@ -1151,6 +1001,7 @@ def copy_to_texture(stage): s[stage].vectorize(axes[-1]) s[stage].bind(block, te.thread_axis("blockIdx.x")) s[stage].bind(thread, te.thread_axis("threadIdx.x")) + copy_to_texture(AT) copy_to_texture(WT) @@ -1190,7 +1041,7 @@ def copy_to_texture(stage): s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) s[OL].vectorize(fb) - #s[OL].unroll() + # s[OL].unroll() s[AA].compute_at(s[OL], rxo) s[WW].compute_at(s[OL], rxo) @@ -1222,353 +1073,59 @@ def copy_to_texture(stage): cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) -def compute_conv2d_NCHWc_KCRSk( - cfg, data, kernel, stride, padding, dilation, out_dtype=None -): - """Convolution operator for 'conv2d_NCHWc_KCRSk'. 
- - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.te.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, - filter_width, num_filter_block, in_channel_block] - - stride : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - - padding : int or str - Padding size, or ['VALID', 'SAME'] - - dilation : int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - out_dtype : str - The output type. This is used for mixed precision. - - Returns - ------- - Output : tvm.te.Tensor - 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] - """ - if out_dtype is None: - out_dtype = data.dtype - ic_block_factor = 4 - oc_block_factor = 4 - - pre_computed = len(kernel.shape) == 5 - if not pre_computed: - batch, channels, height, width = get_const_tuple(data.shape) - out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) - - assert ( - channels % ic_block_factor == 0 - ), "Number of input channels must divide {}".format(ic_block_factor) - assert ( - out_channels % oc_block_factor == 0 - ), "Number of output channels must divide {}".format(oc_block_factor) - - packed_data = te.compute( - (batch, channels // ic_block_factor, height, width, ic_block_factor), - lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], - name="packed_data", - ) - packed_kernel = te.compute( - ( - out_channels // oc_block_factor, - in_channels, - kernel_h, - kernel_w, - oc_block_factor - ), - lambda oc_chunk, ic, kh, kw, oc_block: kernel[ - oc_chunk * oc_block_factor + oc_block, ic, kh, kw - ], - name="packed_kernel", - ) - else: - packed_data = data - packed_kernel = kernel - - batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(packed_data.shape) - oc_chunk, _, kernel_h, kernel_w, oc_block = get_const_tuple(packed_kernel.shape) - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - # pad the input data - pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) - pad_before = [0, 0, pad_top, pad_left, 0] - pad_after = [0, 0, pad_down, pad_right, 0] - pad_data = nn.pad(packed_data, pad_before, pad_after, name="pad_data") - - # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - - oshape = (batch, oc_chunk, out_height, out_width, oc_block) - - icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") - icb = te.reduce_axis((0, ic_block_factor), name="ic_block") - kh = te.reduce_axis((0, kernel_h), name="kh") - kw = te.reduce_axis((0, kernel_w), name="kw") - - conv = te.compute( - oshape, - lambda n, occ, oh, ow, ocb: te.sum( - pad_data[ - n, - icc, - oh * stride_h + kh * dilation_h, - ow * stride_w + kw * dilation_w, - icb, - ] - * packed_kernel[occ, icc * ic_block + icb, kh, kw, ocb], - axis=[icc, kh, kw, icb], - ), - ) - - # Type conversion - output = te.compute( - oshape, lambda *index: conv(*index).astype(out_dtype), tag="conv2d_NCHWc_KCRSk" - ) - - num_flop = ( - batch - * oc_chunk - * oc_block 
- * out_height - * out_width - * ic_chunk - * ic_block - * kernel_h - * kernel_w - * 2 - ) - cfg.add_flop(num_flop) - - return output - - -def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): - """Schedule conv2d NCHWc template""" - - conv = output.op.input_tensors[0] - packed_data, packed_kernel = conv.op.input_tensors - - if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: - pad_data = packed_data - packed_data = pad_data.op.input_tensors[0] - else: - pad_data = packed_data - - # if autotvm.GLOBAL_SCOPE.in_tuning: - # # skip this part during tuning to make records accurate - # # this part will be pre-computed during NNVM's pre-compute optimization pass - # s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") - # s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") - # else: - # if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel": - # # data and kernel are not pre-computed, schedule layout transform here - # schedule_injective_from_existing(s, packed_data) - # schedule_injective_from_existing(s, packed_kernel) - - if pad_data != packed_data: - s[pad_data].compute_inline() - - # create cache stage - AA = s.cache_read(pad_data, "shared", [conv]) - WW = s.cache_read(packed_kernel, "shared", [conv]) - - s[conv].set_scope("local") - - # handle bias - if output.op not in s.outputs: - s[output].compute_inline() - output = s.outputs[0].output(0) - - oc_chunk = nn.get_const_int(output.shape[1]) - # tile and bind spatial axes - n, f, y, x, c = s[output].op.axis - cfg.define_split("tile_n", n, num_outputs=4) - cfg.define_split("tile_f", cfg.axis(oc_chunk), num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - - # this is the scope to attach global config inside this kernel - kernel_scope, n = s[output].split(n, nparts=1) - - s[output].bind(n, te.thread_axis("blockIdx.z")) - bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, te.thread_axis("blockIdx.z")) - #s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) - s[output].bind(vn, te.thread_axis("vthread")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf - if cfg["fuse_yx"].val: - s[output].bind(tn, te.thread_axis("threadIdx.z")) - s[output].bind(tf, te.thread_axis("threadIdx.y")) - tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, te.thread_axis("threadIdx.x")) - s[conv].compute_at(s[output], tyx) - - # number of threads - n_tz = cfg["tile_n"].size[2] - n_ty = cfg["tile_f"].size[2] - n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] - else: - s[output].bind(tn, te.thread_axis("threadIdx.z")) - s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[conv].compute_at(s[output], tx) - - # number of threads - n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] - n_ty = cfg["tile_y"].size[2] - n_tx = cfg["tile_x"].size[2] - - # tile and bind reduction axes - n, f, y, x, c = 
s[conv].op.axis - rc, ry, rx, rc_block = s[conv].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) - rco, rci = cfg["tile_rc"].apply(s, conv, rc) - ryo, ryi = cfg["tile_ry"].apply(s, conv, ry) - rxo, rxi = cfg["tile_rx"].apply(s, conv, rx) - - s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) - #_, rc_block = s[conv].split(rc_block, factor=4) - #s[conv].tensorize(rc_block, _dp4a) - - s[AA].compute_at(s[conv], rxo) - s[WW].compute_at(s[conv], rxo) - - # cooperative fetching - for load in [AA, WW]: - fcd = s[load].op.axis[-1] - #fcd_outer, fcd = s[load].split(fcd, factor=4) - s[load].vectorize(fcd) - #fused = s[load].op.axis[:-1] + [fcd_outer] - fused = s[load].op.axis[:-1] - fused = s[load].fuse(*fused) - - fused, tx = s[load].split(fused, factor=n_tx) - fused, ty = s[load].split(fused, factor=n_ty) - fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - return s - - -#@autotvm.template("matmul_vector_accumulator_tune") -def matmul_vector_acc_template(shapeA, shapeB): - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = schedule_matmul_vector_accumulator_autotvm(*placeholders) +def scheduler(compute, schedule, *args, **kwargs): + placeholders = compute(*args) + s = schedule(*placeholders, **kwargs) return s, placeholders -#@autotvm.template("conv2d_1x1_NCHWc_RSCKk_tune") -def conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape): + +def conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape) s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders) return s, placeholders -#@autotvm.template("conv2d_1x1_WCHNc_CRSKk_tune") -def conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape): + +def conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape) s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders) return s, (placeholders[0], placeholders[1], placeholders[-1]) -#@autotvm.template("conv2d_cuda_NCHW_KCRS_tune") -def conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape): - data = te.placeholder(input_shape, name="data", dtype="float32") - filt = te.placeholder(filter_shape, name="filter", dtype="float32") - conv = compute_conv2d_cuda_NCHW_KCRS(data, filt, [1,1], [0,0], [0,0], "float32") - cfg = autotvm.get_config() - s = te.create_schedule([x.op for x in [conv]]) - schedule_conv2d_cuda_NCHW_KCRS(cfg, s, conv) - return s, (data, filt, conv) - -#@autotvm.template("conv2d_cuda_NCHWc_KCRSk_tune") -def conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape): - cfg = autotvm.get_config() - data = te.placeholder(input_shape, name="data", dtype="float32") - filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_conv2d_NCHWc_KCRSk(cfg, data, filt, [1,1], [0,0], [0,0], "float32") - s = te.create_schedule([x.op for x in [output]]) - s = schedule_conv2d_NCHWc_KCRSk(cfg, s, output) - return s, (data, filt, output) -def conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape): +def conv2d_NCHWc_KCRSk(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - conv = compute_conv2d_NCHWc_KCRSk_tx(data, filt, 
[1,1], [0,0], [1,1], "float32") + conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [conv]]) - schedule_conv2d_NCHWc_KCRSk_tx(cfg, s, conv) + schedule_conv2d_NCHWc_KCRSk(cfg, s, conv) return s, (data, filt, conv) -def conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape): + +def conv2d_NCHWc_KCRSk_fp32_acc(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + output = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, [1, 1], [0, 0], [1, 1], "float32") cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [output]]) - schedule_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) return s, (data, filt, output) -def depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape): + +def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape): data = te.placeholder(input_shape, name="data", dtype="float32") filt = te.placeholder(filter_shape, name="filter", dtype="float32") - output = compute_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(data, filt, [1,1], [0,0], [1,1], "float32") + output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32( + data, filt, [1, 1], [0, 0], [1, 1], "float32" + ) cfg = autotvm.get_config() s = te.create_schedule([x.op for x in [output]]) - schedule_depthwise_conv2d_NCHWc_KCRSk_tx_acc32(cfg, s, output) + schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) return s, (data, filt, output) -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune") -def conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_tune2") -def conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune") -def conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) - -#@autotvm.template("conv2d_NCHWc_KCRSk_tx_fp32acc_tune2") -def conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape): - return conv2d_NCHWc_KCRSk_tx_template_fp32_acc_impl(input_shape, filter_shape) - -#@autotvm.template("depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune") -def depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape): - return depthwise_conv2d_NCHWc_KCRSk_tx_template_acc32_impl(input_shape, filter_shape) def ref_convolution(data, kernel, stride, pad): import mxnet as mx + groups = 1 kernel_size = (kernel.shape[2], kernel.shape[3]) num_filter = kernel.shape[0] @@ -1585,8 +1142,10 @@ def ref_convolution(data, kernel, stride, pad): ) return ref_res.asnumpy() + def ref_depthwise_convolution(data, kernel, stride, pad): import mxnet as mx + groups = kernel.shape[0] kernel_size = (kernel.shape[2], kernel.shape[3]) num_filter = kernel.shape[0] @@ -1604,8 +1163,9 @@ def ref_depthwise_convolution(data, kernel, stride, pad): ) return ref_res.asnumpy() -def validate(workload, target, dev): - s, placeholders = workload() + +def validate(workload, target, dev, input_shapes, *args, **kwargs): + s, placeholders = workload(*input_shapes, *args, **kwargs) func = tvm.driver.build(s, [*placeholders], 
target=target, name="TestFunction") args_tvm = [] @@ -1614,154 +1174,227 @@ def validate(workload, target, dev): var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype) args_np.append(var_np) args_tvm.append(tvm.nd.array(var_np, dev)) - args_tvm.append(tvm.nd.array(np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev)) + args_tvm.append( + tvm.nd.array( + np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev + ) + ) func(*args_tvm) if "plus_one" in workload.__name__: - np_result = args_np[0] + 1.0; + np_result = args_np[0] + 1.0 elif "matmul" in workload.__name__: - if 'inner' in workload.__name__: - np_result = np.matmul(args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)) - elif 'accum' in workload.__name__: - np_result = np.matmul(args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)) + if "inner" in workload.__name__: + np_result = np.matmul( + args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0) + ) + elif "accum" in workload.__name__: + np_result = np.matmul( + args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64) + ) else: - np_result = np.matmul(args_np[0].transpose((0, 2, 1)).reshape(128, 64), args_np[1].transpose(1, 0, 2).reshape(64,128)) - elif "conv2d_1x1_NCHWc_RSCKk_tune" in workload.__name__: + np_result = np.matmul( + args_np[0].transpose((0, 2, 1)).reshape(128, 64), + args_np[1].transpose(1, 0, 2).reshape(64, 128), + ) + elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__: vec_length = args_np[1].shape[-1] # nchwc -> nchw - args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + args_np[0] = ( + args_np[0] + .transpose((0, 1, 4, 2, 3)) + .reshape( + args_np[0].shape[0], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[3], + ) + ) # rsckk -> rsck -> kcrs - args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 2, 0, 1)) + args_np[1] = ( + args_np[1] + .reshape( + args_np[1].shape[0], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3] * args_np[1].shape[4], + ) + .transpose((3, 2, 0, 1)) + ) np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) # nkhw -> nkhwk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) - elif "conv2d_1x1_WCHNc_CRSKk_tune" in workload.__name__: + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(0, 1, 3, 4, 2) + elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__: vec_length = args_np[1].shape[-1] # wchnc -> nchw - args_np[0] = args_np[0].transpose((3, 1, 4, 2, 0)).reshape(args_np[0].shape[3], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[0]) + args_np[0] = ( + args_np[0] + .transpose((3, 1, 4, 2, 0)) + .reshape( + args_np[0].shape[3], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[0], + ) + ) # crskk -> crsk -> kcrs - args_np[1] = args_np[1].reshape(args_np[1].shape[0], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]*args_np[1].shape[4]).transpose((3, 0, 1, 2)) + args_np[1] = ( + args_np[1] + .reshape( 
+ args_np[1].shape[0], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3] * args_np[1].shape[4], + ) + .transpose((3, 0, 1, 2)) + ) np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) # nkhw -> nkkhw -> wkhnk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(4, 1, 3, 0, 2) + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(4, 1, 3, 0, 2) elif "NCHW_KCRS" in workload.__name__: np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) elif "NCHWc_KCRSk" in workload.__name__: vec_length = args_np[1].shape[-1] # nchwc -> nchw - args_np[0] = args_np[0].transpose((0, 1, 4, 2, 3)).reshape(args_np[0].shape[0], args_np[0].shape[1]*args_np[0].shape[-1], args_np[0].shape[2], args_np[0].shape[3]) + args_np[0] = ( + args_np[0] + .transpose((0, 1, 4, 2, 3)) + .reshape( + args_np[0].shape[0], + args_np[0].shape[1] * args_np[0].shape[-1], + args_np[0].shape[2], + args_np[0].shape[3], + ) + ) # kcrsk/cmrsc -> kcrs/cmrs - args_np[1] = args_np[1].transpose((0, 4, 1, 2, 3)).reshape(args_np[1].shape[0] * args_np[1].shape[4], args_np[1].shape[1], args_np[1].shape[2], args_np[1].shape[3]) + args_np[1] = ( + args_np[1] + .transpose((0, 4, 1, 2, 3)) + .reshape( + args_np[1].shape[0] * args_np[1].shape[4], + args_np[1].shape[1], + args_np[1].shape[2], + args_np[1].shape[3], + ) + ) if "depthwise" in workload.__name__: - #np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") + # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], []) else: - #np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) + # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) np_result = ref_convolution(args_np[0], args_np[1], [], []) # nkhw -> nkhwk - np_result = np_result.reshape(np_result.shape[0], np_result.shape[1]//vec_length, vec_length, np_result.shape[2], np_result.shape[3]).transpose(0, 1, 3, 4, 2) + np_result = np_result.reshape( + np_result.shape[0], + np_result.shape[1] // vec_length, + vec_length, + np_result.shape[2], + np_result.shape[3], + ).transpose(0, 1, 3, 4, 2) np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2) -def verify_plus_one_rank3(): - shape =(32, 32, 4) - placeholders = compute(shape) - s = schedule(*placeholders) - return s, placeholders -def verify_matmul(): - shape = (32, 64, 4) - placeholders = compute_matmul(shape) - s = schedule_matmul(*placeholders) - return s, placeholders +class BaseSingleShapeValidator: + @tvm.testing.parametrize_targets("opencl") + def test_unary(self, test_func, input_shape, target, dev): + validate(test_func, target, dev, [input_shape]) -def verify_matmul_with_local(): - shape = (32, 64, 4) - placeholders = compute_matmul(shape) - s = schedule_matmul(*placeholders, local=True) - return s, placeholders -def verify_matmul_inner(): - shape = (32, 64, 4) - placeholders = compute_matmul_inner(shape) - s = schedule_matmul_inner(*placeholders) - return s, placeholders +class TestPlusOneRank3(BaseSingleShapeValidator): + input_shape = tvm.testing.parameter((32, 32, 4)) -def verify_matmul_vector_accumulator(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = 
schedule_matmul_vector_accumulator(*placeholders) - return s, placeholders + def plus_one(input_shape): + return scheduler(compute_plus_one_rank3, schedule_plus_one_rank3, input_shape) -def verify_matmul_vector_accumulator_with_local(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - placeholders = compute_matmul_vector_accumulator(shapeA, shapeB) - s = schedule_matmul_vector_accumulator(*placeholders, local=True) - return s, placeholders + test_func = tvm.testing.parameter(plus_one) -def verify_plus_one_rank5(): - shape =(32, 2, 4, 4, 4) - placeholders = compute5d(shape) - s = schedule5d(*placeholders) - return s, placeholders -def verify_matmul_vector_accumulator_tune(): - shapeA, shapeB = (32, 64, 4), (128, 16, 4) - return matmul_vector_acc_template(shapeA, shapeB) - -def verify_conv2d_1x1_NCHWc_RSCKk_tune(): - # mobilenetv1 1x1 conv2d - input_shape, filter_shape = (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) - return conv2d_1x1_NCHWc_RSCKk_template(input_shape, filter_shape) - -def verify_conv2d_1x1_WCHNc_CRSKk_tune(): - input_shape, filter_shape = (56, 128//4, 56, 1, 4), (128, 1, 1, 128//4, 4) - return conv2d_1x1_WCHNc_CRSKk_template(input_shape, filter_shape) - -def verify_conv2d_cuda_NCHW_KCRS_tune(): - # NCHW, KCRS - input_shape, filter_shape = (1, 128, 56, 56), (128, 128, 1, 1) - return conv2d_cuda_NCHW_KCRS_template(input_shape, filter_shape) - -def verify_conv2d_cuda_NCHWc_KCRSk_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_cuda_NCHWc_KCRSk_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 56, 56, 4), (32, 128, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_fp32acc_template(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_tune2(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (256, 512, 1, 1, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (128, 512, 3, 3, 4) - # input_shape, filter_shape = (1, 128, 7, 7, 4), (512, 512, 1, 1, 4) - return conv2d_NCHWc_KCRSk_tx_template2(input_shape, filter_shape) - -def verify_conv2d_NCHWc_KCRSk_tx_fp32acc_tune2(): - # NCHWc, KCRSk - input_shape, filter_shape = (1, 32, 112, 112, 4), (32, 128, 3, 3, 4) - return conv2d_NCHWc_KCRSk_tx_fp32acc_template2(input_shape, filter_shape) - -def verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(): - # deeplabv3 - # [1, 144, 129, 129], [144, 1, 3, 3] - # [1, 96, 257, 257], [96, 1, 3, 3] - # [N, C, H, W], [K, 1, R, S] - # [N, C/4, H, W, 4c], [C/4, 1, R, S, 4c] - input_shape, filter_shape = (1, 96//4, 257, 257, 4), (96//4, 1, 3, 3, 4) - return depthwise_conv2d_NCHWc_KCRSk_tx_acc32_template(input_shape, filter_shape) - -@tvm.testing.parametrize_targets("opencl") -def test_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune(target, dev): - validate(verify_depthwise_conv2d_NCHWc_KCRSk_tx_acc32_tune, target, dev) +class TestPlusOneRank5(BaseSingleShapeValidator): + input_shape = tvm.testing.parameter((32, 2, 4, 4, 4)) + + def plus_one(input_shape): + return scheduler(compute_plus_one_rank5, schedule_plus_one_rank5, input_shape) + + test_func = tvm.testing.parameter(plus_one) + + +class TestMatmul: + input_shape = tvm.testing.parameter((32, 64, 4)) + local = 
tvm.testing.parameter(False, True) + + def matmul(input_shape, local): + return scheduler(compute_matmul, schedule_matmul, input_shape, local=local) + + def matmul_inner(input_shape, local): + return scheduler(compute_matmul_inner, schedule_matmul_inner, input_shape, local=local) + + test_func = tvm.testing.parameter(matmul, matmul_inner) + + @tvm.testing.parametrize_targets("opencl") + def test_matmul(self, test_func, input_shape, local, target, dev): + validate(test_func, target, dev, [input_shape], local=local) + + +class TestMatmulVectorAccumulator: + shapeA = tvm.testing.parameter((32, 64, 4)) + shapeB = tvm.testing.parameter((128, 16, 4)) + local = tvm.testing.parameter(False, True) + + def matmul_vector_accumulator(shapeA, shapeB, local): + return scheduler( + compute_matmul_vector_accumulator, + schedule_matmul_vector_accumulator, + shapeA, + shapeB, + local=local, + ) + + test_func = tvm.testing.parameter(matmul_vector_accumulator) + + @tvm.testing.parametrize_targets("opencl") + def test_matmul_vec_acc(self, test_func, shapeA, shapeB, local, target, dev): + validate(test_func, target, dev, [shapeA, shapeB], local=local) + + +class BaseConv2DValidator: + @tvm.testing.parametrize_targets("opencl") + def test_conv2d(self, test_func, input_shapes, target, dev): + validate(test_func, target, dev, input_shapes) + + +class TestConv2dNCHWcRSCKk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(1, 32, 56, 56, 4), (1, 1, 128, 32, 4)]) + test_func = tvm.testing.parameter(conv2d_1x1_NCHWc_RSCKk) + + +class TestConv2dWCHNcCRSKk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(56, 32, 56, 1, 4), (128, 1, 1, 32, 4)]) + test_func = tvm.testing.parameter(conv2d_1x1_WCHNc_CRSKk) + + +class TestConv2dNCHWcKCRSk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter( + [(1, 32, 56, 56, 4), (32, 128, 1, 1, 4)], [(1, 32, 112, 112, 4), (32, 128, 3, 3, 4)] + ) + test_func = tvm.testing.parameter(conv2d_NCHWc_KCRSk, conv2d_NCHWc_KCRSk_fp32_acc) + + +class TestDepthwiseConv2dNCHWcKCRSk(BaseConv2DValidator): + input_shapes = tvm.testing.parameter([(1, 24, 257, 257, 4), (24, 1, 3, 3, 4)]) + test_func = tvm.testing.parameter(depthwise_conv2d_NCHWc_KCRSk_acc32) + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) From 2903e53e13d104032015484492a52bcf4363b4e8 Mon Sep 17 00:00:00 2001 From: Chris Sullivan Date: Tue, 17 Aug 2021 21:29:20 -0700 Subject: [PATCH 59/59] Respond to CRs. 
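
Per review, rename the InferTextureAccess access-mask constants read_access and write_access to kReadAccess and kWriteAccess, matching the k-prefixed constant naming convention.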
--- src/target/source/codegen_opencl.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8d760a07e032..7abff36a3ddb 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -36,19 +36,19 @@ namespace codegen { class InferTextureAccess : public StmtExprVisitor { public: - static constexpr const uint8_t read_access = 1; - static constexpr const uint8_t write_access = 2; + static constexpr const uint8_t kReadAccess = 1; + static constexpr const uint8_t kWriteAccess = 2; InferTextureAccess() {} std::unordered_map Infer(const Stmt& n) { StmtExprVisitor::VisitStmt(n); std::unordered_map storage_scope_qualifiers; for (auto& texture : var_access_map_) { - if (texture.second == read_access) { + if (texture.second == kReadAccess) { storage_scope_qualifiers.insert({texture.first, "texture_read"}); - } else if (texture.second == write_access) { + } else if (texture.second == kWriteAccess) { storage_scope_qualifiers.insert({texture.first, "texture_write"}); - } else if (texture.second == (read_access | write_access)) { + } else if (texture.second == (kReadAccess | kWriteAccess)) { storage_scope_qualifiers.insert({texture.first, ""}); } } @@ -56,9 +56,9 @@ class InferTextureAccess : public StmtExprVisitor { } void VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::texture2d_load())) { - var_access_map_[op->args[0].as()] |= read_access; + var_access_map_[op->args[0].as()] |= kReadAccess; } else if (op->op.same_as(builtin::texture2d_store())) { - var_access_map_[op->args[0].as()] |= write_access; + var_access_map_[op->args[0].as()] |= kWriteAccess; } else { StmtExprVisitor::VisitExpr_(op); }