From 7d9d9c956049710fa59991170cd97f4ef9896676 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Tue, 17 Aug 2021 16:52:54 -0700 Subject: [PATCH 01/18] Remove compile_engine.h for real --- src/relay/backend/build_module.cc | 4 +- src/relay/backend/compile_engine.cc | 338 ------------------ src/relay/backend/compile_engine.h | 115 ------ src/relay/backend/interpreter.cc | 7 +- .../auto_scheduler_layout_rewrite.cc | 4 +- 5 files changed, 6 insertions(+), 462 deletions(-) delete mode 100644 src/relay/backend/compile_engine.cc delete mode 100644 src/relay/backend/compile_engine.h diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index b2b73e9bad02..88e9c8f058f5 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -33,7 +33,7 @@ #include "../../target/func_registry_generator.h" #include "../../target/source/codegen_source_base.h" -#include "compile_engine.h" +#include "te_compiler.h" #include "utils.h" namespace tvm { @@ -286,8 +286,6 @@ class RelayBuildModule : public runtime::ModuleNode { executor_ = executor; CheckAndUpdateHostConsistency(&targets_, &target_host_); BuildRelay(mod, params_, mod_name); - // Clear compile engine so that tuning schedules can be changed between runs. See issue #6096. - CompileEngine::Global()->Clear(); } protected: diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc deleted file mode 100644 index 6142e8323dea..000000000000 --- a/src/relay/backend/compile_engine.cc +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file relay/backend/compile_engine.cc - * \brief Internal compialtion engine. - */ -#include "compile_engine.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "../../runtime/meta_data.h" -#include "../transforms/pass_utils.h" -#include "te_compiler_cache.h" -#include "utils.h" - -namespace tvm { -namespace relay { - -TVM_REGISTER_OBJECT_TYPE(CompileEngineNode); - -class CompileEngineImpl : public CompileEngineNode { - public: - // Lower the function. - CachedFunc Lower(const CCacheKey& key, std::function mangle_fn) { - return LowerInternal(key, mangle_fn)->cached_func; - } - - CachedFunc Lower(const CCacheKey& key, const String mod_name) { - auto mangle_fn = [mod_name](String name) { return runtime::get_name_mangled(mod_name, name); }; - - return Lower(key, mangle_fn); - } - - // For now, build one module per function. 
- PackedFunc JIT(const CCacheKey& key) final { - auto mangle_fn = [](String name) { return name; }; - CCacheValue value = LowerInternal(key, mangle_fn); - if (value->packed_func != nullptr) return value->packed_func; - auto m = build(value->cached_func->funcs, key->target, Target(nullptr)); - value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint); - return value->packed_func; - } - - CachedFunc LowerShapeFunc(const CCacheKey& key) final { - return LowerShapeFuncInternal(key)->cached_func; - } - - Array LowerExternalFunctions() { - Array ret; - std::unordered_map cached_symbol; - std::vector cached_ext_funcs; - for (const auto& it : cache_) { - auto src_func = it.first->source_func; - ICHECK(src_func.defined()); - - if (src_func->GetAttr(attr::kCompiler).defined()) { - auto code_gen = src_func->GetAttr(attr::kCompiler); - ICHECK(code_gen.defined()) << "No external codegen is set"; - std::string code_gen_name = code_gen.value(); - cached_ext_funcs.push_back(it.first); - - auto symbol_name = src_func->GetAttr(tvm::attr::kGlobalSymbol); - ICHECK(symbol_name.defined()) << "No external symbol is set for:\n" - << AsText(src_func, false) << "\n" - << "Functions with external codegen must have the " - << tvm::attr::kGlobalSymbol << " attr set."; - - std::string sn = symbol_name.value(); - if (!cached_symbol.count(sn)) { - cached_symbol[sn] = code_gen_name; - } else { - ICHECK_NE(cached_symbol[sn], code_gen_name) - << "Found duplicated symbol: " << sn << " for: " << code_gen_name; - } - - std::string ext_name = "relay.ext." + code_gen_name; - auto pf = tvm::runtime::Registry::Get(ext_name); - ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; - // No need to keep compiler attribute at this point, functions have been - // extracted for specific codegen. 
- src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue()); - runtime::Module ext_mod = (*pf)(src_func); - - // todo(@zhiics, @jroesch): Should this be a user visible error? - ICHECK(ext_mod.defined()) << "No external library was generated for " << ext_name - << "even though it was requested" - "by the annotated function " - << PrettyPrint(src_func); - - ret.push_back(ext_mod); - } - } - - // No need to cache external functions as we collected them all to create - // external runtime modules. - for (const auto& it : cached_ext_funcs) { - cache_.erase(it); - } - return ret; - } - - void Clear() final { cache_.clear(); } - - // List all items in the cache. - Array ListItems() { - std::lock_guard lock(mutex_); - Array items; - for (auto& kv : cache_) { - items.push_back(kv.first); - items.push_back(kv.second); - } - return items; - } - - // List all items in the shape_func_cache. - Array ListShapeFuncItems() { - std::lock_guard lock(mutex_); - Array items; - for (auto& kv : shape_func_cache_) { - items.push_back(kv.first); - items.push_back(kv.second); - } - return items; - } - - /*! - * \brief Get the cache key of the function that is being lowered currently - * \return the cache key - */ - CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; } - - private: - // implement lowered func - CCacheValue LowerInternal(const CCacheKey& key, std::function mangle_fn) { - std::lock_guard lock(mutex_); - CCacheValue value; - auto it = cache_.find(key); - if (it != cache_.end()) { - it->second->use_count += 1; - if (it->second->cached_func.defined()) return it->second; - value = it->second; - } else { - value = CCacheValue(make_object()); - value->use_count = 0; - if (!backend::IsCompileEngineCacheDisabled()) { - cache_[key] = value; - } - } - cur_ccache_key_ = key; - - // No need to lower external functions for now. We will invoke the external - // codegen tool once and lower all functions together. 
- if (key->source_func->GetAttr(attr::kCompiler).defined()) { - auto ir_module = IRModule(); - const auto name_node = key->source_func->GetAttr(tvm::attr::kGlobalSymbol); - ICHECK(name_node.defined()) << "External function has not been attached a name yet."; - auto func_name = std::string(name_node.value()); - auto target = Target("ext_dev"); - auto global_var = GlobalVar(func_name); - global_var->checked_type_ = key->source_func->checked_type(); - ir_module->Add(global_var, key->source_func); - value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module); - return value; - } - - // Enforce use the target. - With target_scope(key->target); - - ICHECK(!value->cached_func.defined()); - auto cfunc = PrimFuncFor(key->source_func, key->target, [&](std::string name) { - return GetUniqueName(mangle_fn(name), &name_map_); - }); - - // Skip lowering for device copy node. - const Expr body = (key->source_func)->body; - if (const CallNode* call_node = body.as()) { - if (call_node->attrs.as()) { - value->cached_func = cfunc; - return value; - } - } - - // NOTE: array will copy on write. - Array all_args = Array(cfunc->inputs); - for (te::Tensor arg : cfunc->outputs) { - all_args.push_back(arg); - } - // lower the function - std::unordered_map binds; - auto func_name = cfunc->prim_fn_var->name_hint; - cfunc->funcs->Update(tvm::LowerSchedule(cfunc->schedule, all_args, func_name, binds)); - value->cached_func = cfunc; - - return value; - } - - // implement lowered shape func - CCacheValue LowerShapeFuncInternal(const CCacheKey& key) { - std::lock_guard lock(mutex_); - CCacheValue value; - auto it = shape_func_cache_.find(key); - if (it != shape_func_cache_.end()) { - it->second->use_count += 1; - if (it->second->cached_func.defined()) return it->second; - value = it->second; - } else { - value = CCacheValue(make_object()); - value->use_count = 0; - shape_func_cache_[key] = value; - } - // Enforce use the target. 
- With target_scope(key->target); - - ICHECK(!value->cached_func.defined()); - using tvm::transform::PassContext; - With fresh_pass_ctx_scope(PassContext::Create()); - - auto cached_func = ShapeFuncFor(key->source_func, key->target, [&](std::string name) { - return GetUniqueName(name, &name_map_); - }); - - value->cached_func = cached_func; - return value; - } - - /*! \brief compiler cache lock*/ - std::mutex mutex_; - /*! \brief internal name map to get an unique name */ - std::unordered_map name_map_; - /*! \brief internal compiler cache */ - std::unordered_map cache_; - /*! \brief internal compiler cache for shape funcs */ - std::unordered_map shape_func_cache_; - /*! \brief the cache key of the function that is being lowered currently*/ - CCacheKey cur_ccache_key_; -}; - -/*! \brief The global compile engine */ -CompileEngine& CompileEngine::Global() { - // intentionally allocate raw pointer to avoid - // free during destructuion. - static CompileEngine* inst = new CompileEngine(make_object()); - return *inst; -} - -TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.disable_compile_engine_cache", Bool); - -TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") - .set_body_typed([](tvm::Array outputs, OpImplementation impl) { - return LoweredOutput(outputs, impl); - }); - -TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") - .set_body_typed([](Function source_func, Target target) { - return CCacheKey(source_func, target); - }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGlobal").set_body_typed([]() { - return CompileEngine::Global(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineClear").set_body_typed([](CompileEngine self) { - self->Clear(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLower") - .set_body_typed([](CompileEngine self, CCacheKey key, const String mod_name) { - return self->Lower(key, mod_name); - }); - 
-TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLowerShapeFunc") - .set_body_typed([](CompileEngine self, CCacheKey key) { return self->LowerShapeFunc(key); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileLowerExternalFunctions") - .set_body_typed([](CompileEngine self) { return self->LowerExternalFunctions(); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT") - .set_body_typed([](CompileEngine self, CCacheKey key) { return self->JIT(key); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems").set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->ListItems(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListShapeFuncItems") - .set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->ListShapeFuncItems(); - }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGetCurrentCCacheKey") - .set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->GetCurrentCCacheKey(); - }); - -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h deleted file mode 100644 index 4afdc6d30485..000000000000 --- a/src/relay/backend/compile_engine.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file relay/backend/compile_engine.h - * \brief Internal compilation layer which lowers Relay "primitive functions" to TIR PrimFns. - * - * This layer represents the older design of the Relay compilation flow and is being deprecated - * in favor of te_compiler.h which is a migration step towards a standard pass based lowering of - * Relay functions. - * - */ -#ifndef TVM_RELAY_BACKEND_COMPILE_ENGINE_H_ -#define TVM_RELAY_BACKEND_COMPILE_ENGINE_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "te_compiler_cache.h" - -namespace tvm { -namespace relay { - -using namespace tvm::relay::tec; - -/*! - * \brief Backend compilation engine for - * low level code generation. - */ -class CompileEngineNode : public Object { - public: - /*! \brief destructor */ - virtual ~CompileEngineNode() {} - /*! - * \brief Get lowered result. - * \param key The key to the cached function. - * \param mod_name The mangling function for mangling names. - * \return The result. - */ - virtual CachedFunc Lower(const CCacheKey& key, std::function mangle_fn) = 0; - - /*! - * \brief Get lowered result. - * \param key The key to the cached function. - * \param mod_name The module name to mangle the functions. - * \return The result. - */ - virtual CachedFunc Lower(const CCacheKey& key, const String mangle_fn) = 0; - /*! - * \brief Just in time compile to get a PackedFunc. - * \param key The key to the cached function. - * \return The result. - */ - virtual PackedFunc JIT(const CCacheKey& key) = 0; - /*! 
- * \brief Lower the shape function. - * \param key The key to the cached function. - * \return The result. - */ - virtual CachedFunc LowerShapeFunc(const CCacheKey& key) = 0; - /*! - * \brief Lower the external function using external codegen tools. - * \return The runtime moduels for each needed external codegen tool. - */ - virtual tvm::Array LowerExternalFunctions() = 0; - - /*! \brief clear the cache. */ - virtual void Clear() = 0; - - // VisitAttrs - void VisitAttrs(AttrVisitor*) {} - - static constexpr const char* _type_key = "relay.CompileEngine"; - TVM_DECLARE_FINAL_OBJECT_INFO(CompileEngineNode, Object); -}; - -/*! \brief cache entry used in compile engine */ -class CompileEngine : public ObjectRef { - public: - CompileEngine() {} - explicit CompileEngine(ObjectPtr n) : ObjectRef(n) {} - CompileEngineNode* operator->() { return static_cast(get_mutable()); } - using ContainerType = CompileEngineNode; - /*! \brief The global compile engine. */ - TVM_DLL static CompileEngine& Global(); -}; - -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_BACKEND_COMPILE_ENGINE_H_ diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index af2cbae1f72d..b264fe8f5c85 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -36,7 +36,6 @@ #include #include "../transforms/pass_utils.h" -#include "compile_engine.h" #include "te_compiler.h" namespace tvm { @@ -479,13 +478,13 @@ class Interpreter : public ExprFunctor, // flattened form of this arg. Does that match what lowering actually does? 
int64_t state = prim_shape_fn_states[i]->value; for (const auto& nd_array : FlattenADT(args[i])) { - if (state & kNeedInputData) { + if (state & tec::kNeedInputData) { auto arr = nd_array.CopyTo(shape_device); inputs[arg_counter] = arr; setter(arg_counter, arr); ++arg_counter; } - if (state & kNeedInputShape) { + if (state & tec::kNeedInputShape) { int64_t ndim = nd_array.Shape().size(); NDArray shape_arr; if (ndim == 0) { @@ -922,7 +921,7 @@ std::pair> Prepare(IRModule mod, Device device, // Lower all primitive functions reachable from expr. // TODO(mbs): This should be just another pass in seq above, which requires LoweredModule to // be merged into IRModule. - LoweredModule lowered_module = + tec::LoweredModule lowered_module = tec::LowerTE(mod, targets, device_map, memory_plan, /*module_name=*/"intrp", [](Function func) { /* no-op */ }); return {lowered_module.main_module, lowered_module.per_target_module}; diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index 7a86af8aeffa..c24c41a086d4 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -34,7 +34,7 @@ #include #include -#include "../backend/compile_engine.h" +#include "../backend/te_compiler.h" #include "pattern_utils.h" namespace tvm { @@ -126,7 +126,7 @@ Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { CHECK(f) << "Could not find auto_scheduler.enter_layout_rewrite function."; (*f)(); - PrimFuncFor(GetRef(func), Target::Current(), [](std::string name) { return name; }); + tec::PrimFuncFor(GetRef(func), Target::Current(), [](std::string name) { return name; }); f = runtime::Registry::Get("auto_scheduler.exit_layout_rewrite"); CHECK(f) << "Could not find ansor.exit_layout_rewrite function."; From 79d42e5c81d4cdad5c65f47cdd4591987014b645 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Tue, 17 Aug 2021 22:56:42 -0700 Subject: [PATCH 02/18] Fix 
format --- src/relay/transforms/auto_scheduler_layout_rewrite.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index c24c41a086d4..c538dac048b3 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -126,7 +126,8 @@ Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { CHECK(f) << "Could not find auto_scheduler.enter_layout_rewrite function."; (*f)(); - tec::PrimFuncFor(GetRef(func), Target::Current(), [](std::string name) { return name; }); + tec::PrimFuncFor(GetRef(func), Target::Current(), + [](std::string name) { return name; }); f = runtime::Registry::Get("auto_scheduler.exit_layout_rewrite"); CHECK(f) << "Could not find ansor.exit_layout_rewrite function."; From efa80c4e1dc4161a97f56320edc9027f53d4a844 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 13 Oct 2021 11:08:01 +0300 Subject: [PATCH 03/18] RM compile_engine.cc --- src/relay/backend/compile_engine.cc | 338 ---------------------------- 1 file changed, 338 deletions(-) delete mode 100644 src/relay/backend/compile_engine.cc diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc deleted file mode 100644 index 0e7af2278375..000000000000 --- a/src/relay/backend/compile_engine.cc +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file relay/backend/compile_engine.cc - * \brief Internal compilation engine. - */ -#include "compile_engine.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "../../runtime/meta_data.h" -#include "../transforms/pass_utils.h" -#include "te_compiler_cache.h" -#include "utils.h" - -namespace tvm { -namespace relay { - -TVM_REGISTER_OBJECT_TYPE(CompileEngineNode); - -class CompileEngineImpl : public CompileEngineNode { - public: - // Lower the function. - CachedFunc Lower(const CCacheKey& key, std::function mangle_fn) { - return LowerInternal(key, mangle_fn)->cached_func; - } - - CachedFunc Lower(const CCacheKey& key, const String mod_name) { - auto mangle_fn = [mod_name](String name) { return runtime::get_name_mangled(mod_name, name); }; - - return Lower(key, mangle_fn); - } - - // For now, build one module per function. 
- PackedFunc JIT(const CCacheKey& key) final { - auto mangle_fn = [](String name) { return name; }; - CCacheValue value = LowerInternal(key, mangle_fn); - if (value->packed_func != nullptr) return value->packed_func; - auto m = build(value->cached_func->funcs, key->target, Target(nullptr)); - value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint); - return value->packed_func; - } - - CachedFunc LowerShapeFunc(const CCacheKey& key) final { - return LowerShapeFuncInternal(key)->cached_func; - } - - Array LowerExternalFunctions() { - Array ret; - std::unordered_map cached_symbol; - std::vector cached_ext_funcs; - for (const auto& it : cache_) { - auto src_func = it.first->source_func; - ICHECK(src_func.defined()); - - if (src_func->GetAttr(attr::kCompiler).defined()) { - auto code_gen = src_func->GetAttr(attr::kCompiler); - ICHECK(code_gen.defined()) << "No external codegen is set"; - std::string code_gen_name = code_gen.value(); - cached_ext_funcs.push_back(it.first); - - auto symbol_name = src_func->GetAttr(tvm::attr::kGlobalSymbol); - ICHECK(symbol_name.defined()) << "No external symbol is set for:\n" - << AsText(src_func, false) << "\n" - << "Functions with external codegen must have the " - << tvm::attr::kGlobalSymbol << " attr set."; - - std::string sn = symbol_name.value(); - if (!cached_symbol.count(sn)) { - cached_symbol[sn] = code_gen_name; - } else { - ICHECK_NE(cached_symbol[sn], code_gen_name) - << "Found duplicated symbol: " << sn << " for: " << code_gen_name; - } - - std::string ext_name = "relay.ext." + code_gen_name; - auto pf = tvm::runtime::Registry::Get(ext_name); - ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; - // No need to keep compiler attribute at this point, functions have been - // extracted for specific codegen. 
- src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue()); - runtime::Module ext_mod = (*pf)(src_func); - - // todo(@zhiics, @jroesch): Should this be a user visible error? - ICHECK(ext_mod.defined()) << "No external library was generated for " << ext_name - << "even though it was requested" - "by the annotated function " - << PrettyPrint(src_func); - - ret.push_back(ext_mod); - } - } - - // No need to cache external functions as we collected them all to create - // external runtime modules. - for (const auto& it : cached_ext_funcs) { - cache_.erase(it); - } - return ret; - } - - void Clear() final { cache_.clear(); } - - // List all items in the cache. - Array ListItems() { - std::lock_guard lock(mutex_); - Array items; - for (auto& kv : cache_) { - items.push_back(kv.first); - items.push_back(kv.second); - } - return items; - } - - // List all items in the shape_func_cache. - Array ListShapeFuncItems() { - std::lock_guard lock(mutex_); - Array items; - for (auto& kv : shape_func_cache_) { - items.push_back(kv.first); - items.push_back(kv.second); - } - return items; - } - - /*! - * \brief Get the cache key of the function that is being lowered currently - * \return the cache key - */ - CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; } - - private: - // implement lowered func - CCacheValue LowerInternal(const CCacheKey& key, std::function mangle_fn) { - std::lock_guard lock(mutex_); - CCacheValue value; - auto it = cache_.find(key); - if (it != cache_.end()) { - it->second->use_count += 1; - if (it->second->cached_func.defined()) return it->second; - value = it->second; - } else { - value = CCacheValue(make_object()); - value->use_count = 0; - if (!backend::IsCompileEngineCacheDisabled()) { - cache_[key] = value; - } - } - cur_ccache_key_ = key; - - // No need to lower external functions for now. We will invoke the external - // codegen tool once and lower all functions together. 
- if (key->source_func->GetAttr(attr::kCompiler).defined()) { - auto ir_module = IRModule(); - const auto name_node = key->source_func->GetAttr(tvm::attr::kGlobalSymbol); - ICHECK(name_node.defined()) << "External function has not been attached a name yet."; - auto func_name = std::string(name_node.value()); - auto target = Target("ext_dev"); - auto global_var = GlobalVar(func_name); - global_var->checked_type_ = key->source_func->checked_type(); - ir_module->Add(global_var, key->source_func); - value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module); - return value; - } - - // Enforce use the target. - With target_scope(key->target); - - ICHECK(!value->cached_func.defined()); - auto cfunc = PrimFuncFor(key->source_func, key->target, [&](std::string name) { - return GetUniqueName(mangle_fn(name), &name_map_); - }); - - // Skip lowering for device copy node. - const Expr body = (key->source_func)->body; - if (const CallNode* call_node = body.as()) { - if (call_node->attrs.as()) { - value->cached_func = cfunc; - return value; - } - } - - // NOTE: array will copy on write. - Array all_args = Array(cfunc->inputs); - for (te::Tensor arg : cfunc->outputs) { - all_args.push_back(arg); - } - // lower the function - std::unordered_map binds; - auto func_name = cfunc->prim_fn_var->name_hint; - cfunc->funcs->Update(tvm::LowerSchedule(cfunc->schedule, all_args, func_name, binds)); - value->cached_func = cfunc; - - return value; - } - - // implement lowered shape func - CCacheValue LowerShapeFuncInternal(const CCacheKey& key) { - std::lock_guard lock(mutex_); - CCacheValue value; - auto it = shape_func_cache_.find(key); - if (it != shape_func_cache_.end()) { - it->second->use_count += 1; - if (it->second->cached_func.defined()) return it->second; - value = it->second; - } else { - value = CCacheValue(make_object()); - value->use_count = 0; - shape_func_cache_[key] = value; - } - // Enforce use the target. 
- With target_scope(key->target); - - ICHECK(!value->cached_func.defined()); - using tvm::transform::PassContext; - With fresh_pass_ctx_scope(PassContext::Create()); - - auto cached_func = ShapeFuncFor(key->source_func, key->target, [&](std::string name) { - return GetUniqueName(name, &name_map_); - }); - - value->cached_func = cached_func; - return value; - } - - /*! \brief compiler cache lock*/ - std::mutex mutex_; - /*! \brief internal name map to get an unique name */ - std::unordered_map name_map_; - /*! \brief internal compiler cache */ - std::unordered_map cache_; - /*! \brief internal compiler cache for shape funcs */ - std::unordered_map shape_func_cache_; - /*! \brief the cache key of the function that is being lowered currently*/ - CCacheKey cur_ccache_key_; -}; - -/*! \brief The global compile engine */ -CompileEngine& CompileEngine::Global() { - // intentionally allocate raw pointer to avoid - // free during destructuion. - static CompileEngine* inst = new CompileEngine(make_object()); - return *inst; -} - -TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.disable_compile_engine_cache", Bool); - -TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") - .set_body_typed([](tvm::Array outputs, OpImplementation impl) { - return LoweredOutput(outputs, impl); - }); - -TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") - .set_body_typed([](Function source_func, Target target) { - return CCacheKey(source_func, target); - }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGlobal").set_body_typed([]() { - return CompileEngine::Global(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineClear").set_body_typed([](CompileEngine self) { - self->Clear(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLower") - .set_body_typed([](CompileEngine self, CCacheKey key, const String mod_name) { - return self->Lower(key, mod_name); - }); - 
-TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLowerShapeFunc") - .set_body_typed([](CompileEngine self, CCacheKey key) { return self->LowerShapeFunc(key); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileLowerExternalFunctions") - .set_body_typed([](CompileEngine self) { return self->LowerExternalFunctions(); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT") - .set_body_typed([](CompileEngine self, CCacheKey key) { return self->JIT(key); }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems").set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->ListItems(); -}); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListShapeFuncItems") - .set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->ListShapeFuncItems(); - }); - -TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGetCurrentCCacheKey") - .set_body_typed([](CompileEngine self) { - CompileEngineImpl* ptr = dynamic_cast(self.operator->()); - ICHECK(ptr != nullptr); - return ptr->GetCurrentCCacheKey(); - }); - -} // namespace relay -} // namespace tvm From 4e5840ec71624110816c148929734c3c9f9036ad Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 13 Oct 2021 17:27:38 +0300 Subject: [PATCH 04/18] Swap compile engine with TECompiler --- python/tvm/relay/backend/__init__.py | 2 +- python/tvm/relay/backend/te_compiler.py | 487 ++++++++++++++++++++++++ 2 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 python/tvm/relay/backend/te_compiler.py diff --git a/python/tvm/relay/backend/__init__.py b/python/tvm/relay/backend/__init__.py index 4fc2b63748db..d76459236515 100644 --- a/python/tvm/relay/backend/__init__.py +++ b/python/tvm/relay/backend/__init__.py @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. 
"""Backend codegen modules for relay.""" -from . import compile_engine +from . import te_compiler diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py new file mode 100644 index 000000000000..9e34f0960d05 --- /dev/null +++ b/python/tvm/relay/backend/te_compiler.py @@ -0,0 +1,487 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=len-as-condition,no-else-return,invalid-name +"""TE compiler engine (replacing legacy compile_engine).""" +from __future__ import absolute_import + +import logging +import numpy as np +import tvm +from tvm import te, autotvm +from tvm.ir.transform import PassContext +from tvm.runtime import Object +from tvm.support import libinfo +from tvm.target import Target +from ..backend.utils import mangle_module_name +from .. import function as _function +from .. import ty as _ty +from . import _backend + +logger = logging.getLogger("te_compiler") +autotvm_logger = logging.getLogger("autotvm") + +_first_warning = True + + +@tvm._ffi.register_object("relay.CCacheKey") +class CCacheKey(Object): + """Key in the CompileEngine. + + Parameters + ---------- + source_func : tvm.relay.Function + The source function. 
+ + target : tvm.Target + The target we want to run the function on. + """ + + def __init__(self, source_func, target): + self.__init_handle_by_constructor__( + _backend._make_CCacheKey, source_func, target) + + +@tvm._ffi.register_object("relay.CCacheValue") +class CCacheValue(Object): + """Value in the CompileEngine, including usage statistics.""" + + +def _get_cache_key(source_func, target): + if isinstance(source_func, _function.Function): + if isinstance(target, str): + target = Target(target) + if not target: + raise ValueError("Need target when source_func is a Function") + return CCacheKey(source_func, target) + if not isinstance(source_func, CCacheKey): + raise TypeError("Expect source_func to be CCacheKey") + return source_func + + +def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): + """Select the best implementation from the op strategy. + + If use_autotvm is True, it'll first try to find the best implementation + based on AutoTVM profile results. If no AutoTVM profile result is found, + it'll choose the implementation with highest plevel. + + If use_autotvm is False, it'll directly choose the implementation with + highest plevel. + + Note that this function doesn't support op with symbolic input shapes. + + Parameters + ---------- + op : tvm.ir.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : List[tvm.te.Tensor] + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.target.Target + The target to compile the op. + + use_autotvm : bool + Whether query AutoTVM to pick the best. + + Returns + ------- + ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) + The best op implementation and the corresponding output tensors. + """ + all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) + best_plevel_impl = max(all_impls, key=lambda x: x.plevel) + + # Disable autotvm if auto_scheduler is enabled. 
+ # (i.e., always return the implementation with the highest priority for auto-scheduler). + if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): + use_autotvm = False + + # If not use autotvm, always return the implementation with the highest priority + if not use_autotvm: + logger.info( + "Using %s for %s based on highest priority (%d)", + best_plevel_impl.name, + op.name, + best_plevel_impl.plevel, + ) + outs = best_plevel_impl.compute(attrs, inputs, out_type) + return best_plevel_impl, outs + + # Otherwise, try autotvm templates + outputs = {} + workloads = {} + best_autotvm_impl = None + best_cfg = None + dispatch_ctx = autotvm.task.DispatchContext.current + old_silent = autotvm.GLOBAL_SCOPE.silent + autotvm.GLOBAL_SCOPE.silent = True + for impl in all_impls: + outs = impl.compute(attrs, inputs, out_type) + outputs[impl] = outs + workload = autotvm.task.get_workload(outs) + workloads[impl] = workload + if workload is None: + # Not an AutoTVM tunable implementation + continue + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: + # Skip fallback config + continue + logger.info("Implementation %s for %s has cost %.2e", + impl.name, op.name, cfg.cost) + if best_cfg is None or best_cfg.cost > cfg.cost: + best_autotvm_impl = impl + best_cfg = cfg + autotvm.GLOBAL_SCOPE.silent = old_silent + + if best_autotvm_impl: + # The best autotvm implementation definitely doesn't use fallback config + logger.info( + "Using %s for %s based on lowest cost (%.2e)", + best_autotvm_impl.name, + op.name, + best_cfg.cost, + ) + return best_autotvm_impl, outputs[best_autotvm_impl] + + # Use the implementation with highest plevel + if workloads[best_plevel_impl] is not None: + msg = ( + "Cannot find tuning records for:\n target=%s\n key=%s\n" + "TVM will apply a default schedule which may negatively impact performance." 
+ % (target, workloads[best_plevel_impl]) + ) + if ( + not autotvm.env.GLOBAL_SCOPE.silent + and msg not in autotvm.task.DispatchContext.warning_messages + ): + autotvm.task.DispatchContext.warning_messages.add(msg) + global _first_warning + if _first_warning: + _first_warning = False + info_msg = ( + "One or more operators have not been tuned. Please tune your model " + "for better performance. Use DEBUG logging level to see more details." + ) + autotvm_logger.warning(info_msg) + autotvm_logger.debug(msg) + + logger.info( + "Using %s for %s based on highest priority (%s)", + best_plevel_impl.name, + op.name, + best_plevel_impl.plevel, + ) + return best_plevel_impl, outputs[best_plevel_impl] + + +@tvm._ffi.register_object("relay.TECompiler") +class TECompiler(Object): + """TECompiler to get lowered code.""" + + def __init__(self): + raise RuntimeError("Cannot construct a TECompiler") + + def lower(self, source_func, target=None, mod_name="default"): + """Lower a source_func to a CachedFunc. + + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + + target : tvm.Target + The target platform. + + Returns + ------- + cached_func: CachedFunc + The result of lowering. + """ + # pylint: disable=broad-except, import-outside-toplevel + try: + mod_name = mangle_module_name(mod_name) + key = _get_cache_key(source_func, target) + print(key) + return _backend._TECompilerLower(self, key, mod_name) + except Exception: + import traceback + + msg = traceback.format_exc() + msg += "Error during compile func\n" + msg += "--------------------------\n" + msg += source_func.astext(show_meta_data=False) + msg += "--------------------------\n" + raise RuntimeError(msg) + + # def lower_shape_func(self, source_func, target=None): + # key = _get_cache_key(source_func, target) + # return _backend._CompileEngineLowerShapeFunc(self, key) + + def jit(self, source_func, target=None): + """JIT a source_func to a tvm.runtime.PackedFunc. 
+ + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + + target : tvm.Target + The target platform. + + Returns + ------- + jited_func: tvm.runtime.PackedFunc + The result of jited function. + """ + key = _get_cache_key(source_func, target) + return _backend._CompileEngineJIT(self, key) + + def clear(self): + """clear the existing cached functions""" + _backend._TECompilerClear(self) + + def items(self): + """List items in the cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of items. + """ + res = _backend._CompileEngineListItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + + def shape_func_items(self): + """List items in the shape_func_cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of shape_func_items. + """ + res = _backend._CompileEngineListShapeFuncItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + + def get_current_ccache_key(self): + return _backend._CompileEngineGetCurrentCCacheKey(self) + + def dump(self): + """Return a string representation of engine dump. 
+ + Returns + ------- + dump : str + The dumped string representation + """ + items = self.items() + res = "====================================\n" + res += "CompilerEngine dump, %d items cached\n" % len(items) + for k, v in items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + shape_func_items = self.shape_func_items() + res += "%d shape_func_items cached\n" % len(shape_func_items) + for k, v in shape_func_items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + return res + + +def get(): + """Get the global compile engine. + + Returns + ------- + engine : tvm.relay.backend.CompileEngine + The compile engine. + """ + return _backend._CompileEngineGlobal() + + +@tvm._ffi.register_object("relay.CompileEngine") +class CompileEngine(Object): + """CompileEngine to get lowered code.""" + + def __init__(self): + raise RuntimeError("Cannot construct a CompileEngine") + + def lower(self, source_func, target=None, mod_name="default"): + """Lower a source_func to a CachedFunc. 
+ + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + + target : tvm.Target + The target platform. + + Returns + ------- + cached_func: CachedFunc + The result of lowering. + """ + # pylint: disable=broad-except, import-outside-toplevel + try: + mod_name = mangle_module_name(mod_name) + key = _get_cache_key(source_func, target) + return _backend._CompileEngineLower(self, key, mod_name) + except Exception: + import traceback + + msg = traceback.format_exc() + msg += "Error during compile func\n" + msg += "--------------------------\n" + msg += source_func.astext(show_meta_data=False) + msg += "--------------------------\n" + raise RuntimeError(msg) + + def lower_shape_func(self, source_func, target=None): + key = _get_cache_key(source_func, target) + return _backend._CompileEngineLowerShapeFunc(self, key) + + def jit(self, source_func, target=None): + """JIT a source_func to a tvm.runtime.PackedFunc. + + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + + target : tvm.Target + The target platform. + + Returns + ------- + jited_func: tvm.runtime.PackedFunc + The result of jited function. + """ + key = _get_cache_key(source_func, target) + return _backend._CompileEngineJIT(self, key) + + def clear(self): + """clear the existing cached functions""" + _backend._CompileEngineClear(self) + + def items(self): + """List items in the cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of items. + """ + res = _backend._CompileEngineListItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + + def shape_func_items(self): + """List items in the shape_func_cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of shape_func_items. 
+ """ + res = _backend._CompileEngineListShapeFuncItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + + def get_current_ccache_key(self): + return _backend._CompileEngineGetCurrentCCacheKey(self) + + def dump(self): + """Return a string representation of engine dump. + + Returns + ------- + dump : str + The dumped string representation + """ + items = self.items() + res = "====================================\n" + res += "CompilerEngine dump, %d items cached\n" % len(items) + for k, v in items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + shape_func_items = self.shape_func_items() + res += "%d shape_func_items cached\n" % len(shape_func_items) + for k, v in shape_func_items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + return res + + +def get(): + """Get the global compile engine. + + Returns + ------- + engine : tvm.relay.backend.CompileEngine + The compile engine. 
+ """ + return _backend._TECompilerGlobal() From 00654631fa9ed11e7096df77eefa3372b285464a Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 13 Oct 2021 18:24:42 +0300 Subject: [PATCH 05/18] Cleanup on compile engine py leftovers --- python/tvm/relay/backend/compile_engine.py | 467 --------------------- python/tvm/relay/backend/te_compiler.py | 146 ++++++- 2 files changed, 138 insertions(+), 475 deletions(-) delete mode 100644 python/tvm/relay/backend/compile_engine.py diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py deleted file mode 100644 index e9129db7b200..000000000000 --- a/python/tvm/relay/backend/compile_engine.py +++ /dev/null @@ -1,467 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=len-as-condition,no-else-return,invalid-name -"""Backend code generation engine.""" -from __future__ import absolute_import - -import logging -import numpy as np -import tvm -from tvm import te, autotvm -from tvm.ir.transform import PassContext -from tvm.runtime import Object -from tvm.support import libinfo -from tvm.target import Target -from ..backend.utils import mangle_module_name -from .. import function as _function -from .. 
import ty as _ty -from . import _backend - -logger = logging.getLogger("compile_engine") -autotvm_logger = logging.getLogger("autotvm") - -_first_warning = True - - -@tvm._ffi.register_object("relay.LoweredOutput") -class LoweredOutput(Object): - """Lowered output""" - - def __init__(self, outputs, implement): - self.__init_handle_by_constructor__(_backend._make_LoweredOutput, outputs, implement) - - -@tvm._ffi.register_object("relay.CCacheKey") -class CCacheKey(Object): - """Key in the CompileEngine. - - Parameters - ---------- - source_func : tvm.relay.Function - The source function. - - target : tvm.Target - The target we want to run the function on. - """ - - def __init__(self, source_func, target): - self.__init_handle_by_constructor__(_backend._make_CCacheKey, source_func, target) - - -@tvm._ffi.register_object("relay.CCacheValue") -class CCacheValue(Object): - """Value in the CompileEngine, including usage statistics.""" - - -def _get_cache_key(source_func, target): - if isinstance(source_func, _function.Function): - if isinstance(target, str): - target = Target(target) - if not target: - raise ValueError("Need target when source_func is a Function") - return CCacheKey(source_func, target) - if not isinstance(source_func, CCacheKey): - raise TypeError("Expect source_func to be CCacheKey") - return source_func - - -def get_shape(shape): - """Convert the shape to correct dtype and vars.""" - ret = [] - for dim in shape: - if isinstance(dim, tvm.tir.IntImm): - if libinfo()["INDEX_DEFAULT_I64"] == "ON": - ret.append(dim) - else: - val = int(dim) - assert val <= np.iinfo(np.int32).max - ret.append(tvm.tir.IntImm("int32", val)) - elif isinstance(dim, tvm.tir.Any): - ret.append(te.var("any_dim", "int32")) - else: - ret.append(dim) - return ret - - -def get_valid_implementations(op, attrs, inputs, out_type, target): - """Get all valid implementations from the op strategy. - - Note that this function doesn't support op with symbolic input shapes. 
- - Parameters - ---------- - op : tvm.ir.Op - Relay operator. - - attrs : object - The op attribute. - - inputs : List[tvm.te.Tensor] - Input tensors to the op. - - out_type : relay.Type - The output type. - - target : tvm.target.Target - The target to compile the op. - - Returns - ------- - ret : List[relay.op.OpImplementation] - The list of all valid op implementations. - """ - fstrategy = op.get_attr("FTVMStrategy") - assert fstrategy is not None, ( - "%s doesn't have an FTVMStrategy registered. You can register " - "one in python with `tvm.relay.op.register_strategy`." % op.name - ) - with target: - strategy = fstrategy(attrs, inputs, out_type, target) - analyzer = tvm.arith.Analyzer() - ret = [] - for spec in strategy.specializations: - if spec.condition: - # check if all the clauses in the specialized condition are true - flag = True - for clause in spec.condition.clauses: - clause = analyzer.canonical_simplify(clause) - if isinstance(clause, tvm.tir.IntImm) and clause.value: - continue - flag = False - break - if flag: - for impl in spec.implementations: - ret.append(impl) - else: - for impl in spec.implementations: - ret.append(impl) - return ret - - -def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): - """Select the best implementation from the op strategy. - - If use_autotvm is True, it'll first try to find the best implementation - based on AutoTVM profile results. If no AutoTVM profile result is found, - it'll choose the implementation with highest plevel. - - If use_autotvm is False, it'll directly choose the implementation with - highest plevel. - - Note that this function doesn't support op with symbolic input shapes. - - Parameters - ---------- - op : tvm.ir.Op - Relay operator. - - attrs : object - The op attribute. - - inputs : List[tvm.te.Tensor] - Input tensors to the op. - - out_type : relay.Type - The output type. - - target : tvm.target.Target - The target to compile the op. 
- - use_autotvm : bool - Whether query AutoTVM to pick the best. - - Returns - ------- - ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) - The best op implementation and the corresponding output tensors. - """ - all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) - best_plevel_impl = max(all_impls, key=lambda x: x.plevel) - - # Disable autotvm if auto_scheduler is enabled. - # (i.e., always return the implementation with the highest priority for auto-scheduler). - if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): - use_autotvm = False - - # If not use autotvm, always return the implementation with the highest priority - if not use_autotvm: - logger.info( - "Using %s for %s based on highest priority (%d)", - best_plevel_impl.name, - op.name, - best_plevel_impl.plevel, - ) - outs = best_plevel_impl.compute(attrs, inputs, out_type) - return best_plevel_impl, outs - - # Otherwise, try autotvm templates - outputs = {} - workloads = {} - best_autotvm_impl = None - best_cfg = None - dispatch_ctx = autotvm.task.DispatchContext.current - old_silent = autotvm.GLOBAL_SCOPE.silent - autotvm.GLOBAL_SCOPE.silent = True - for impl in all_impls: - outs = impl.compute(attrs, inputs, out_type) - outputs[impl] = outs - workload = autotvm.task.get_workload(outs) - workloads[impl] = workload - if workload is None: - # Not an AutoTVM tunable implementation - continue - cfg = dispatch_ctx.query(target, workload) - if cfg.is_fallback: - # Skip fallback config - continue - logger.info("Implementation %s for %s has cost %.2e", impl.name, op.name, cfg.cost) - if best_cfg is None or best_cfg.cost > cfg.cost: - best_autotvm_impl = impl - best_cfg = cfg - autotvm.GLOBAL_SCOPE.silent = old_silent - - if best_autotvm_impl: - # The best autotvm implementation definitely doesn't use fallback config - logger.info( - "Using %s for %s based on lowest cost (%.2e)", - best_autotvm_impl.name, - op.name, - best_cfg.cost, - ) - return 
best_autotvm_impl, outputs[best_autotvm_impl] - - # Use the implementation with highest plevel - if workloads[best_plevel_impl] is not None: - msg = ( - "Cannot find tuning records for:\n target=%s\n key=%s\n" - "TVM will apply a default schedule which may negatively impact performance." - % (target, workloads[best_plevel_impl]) - ) - if ( - not autotvm.env.GLOBAL_SCOPE.silent - and msg not in autotvm.task.DispatchContext.warning_messages - ): - autotvm.task.DispatchContext.warning_messages.add(msg) - global _first_warning - if _first_warning: - _first_warning = False - info_msg = ( - "One or more operators have not been tuned. Please tune your model " - "for better performance. Use DEBUG logging level to see more details." - ) - autotvm_logger.warning(info_msg) - autotvm_logger.debug(msg) - - logger.info( - "Using %s for %s based on highest priority (%s)", - best_plevel_impl.name, - op.name, - best_plevel_impl.plevel, - ) - return best_plevel_impl, outputs[best_plevel_impl] - - -@tvm._ffi.register_func("relay.backend.lower_call") -def lower_call(call, inputs, target): - """Lower the call expression to op implementation and tensor outputs.""" - assert isinstance(call.op, tvm.ir.Op) - op = call.op - - # Prepare the call_node->checked_type(). For the call node inputs, we ensure that - # the shape is Int32. Following code ensures the same for the output as well. 
- # TODO(@icemelon9): Support recursive tuple - ret_type = call.checked_type - if isinstance(ret_type, _ty.TensorType): - ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) - elif isinstance(ret_type, _ty.TupleType): - new_fields = [] - for field in ret_type.fields: - if isinstance(field, _ty.TensorType): - new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) - else: - new_fields.append(field) - ret_type = _ty.TupleType(new_fields) - - is_dyn = _ty.is_dynamic(call.checked_type) - for arg in call.args: - is_dyn = is_dyn or _ty.is_dynamic(arg.checked_type) - - # check if in the AutoTVM tracing mode, and disable if op is not in wanted list - env = autotvm.task.TaskExtractEnv.current - reenable_tracing = False - if env is not None and env.tracing: - if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: - env.tracing = False - reenable_tracing = True - - if not is_dyn: - best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target) - else: - # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. - best_impl, outputs = select_implementation( - op, call.attrs, inputs, ret_type, target, use_autotvm=False - ) - - # re-enable AutoTVM tracing - if reenable_tracing: - env.tracing = True - return LoweredOutput(outputs, best_impl) - - -@tvm._ffi.register_object("relay.CompileEngine") -class CompileEngine(Object): - """CompileEngine to get lowered code.""" - - def __init__(self): - raise RuntimeError("Cannot construct a CompileEngine") - - def lower(self, source_func, target=None, mod_name="default"): - """Lower a source_func to a CachedFunc. - - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - - target : tvm.Target - The target platform. - - Returns - ------- - cached_func: CachedFunc - The result of lowering. 
- """ - # pylint: disable=broad-except, import-outside-toplevel - try: - mod_name = mangle_module_name(mod_name) - key = _get_cache_key(source_func, target) - return _backend._CompileEngineLower(self, key, mod_name) - except Exception: - import traceback - - msg = traceback.format_exc() - msg += "Error during compile func\n" - msg += "--------------------------\n" - msg += source_func.astext(show_meta_data=False) - msg += "--------------------------\n" - raise RuntimeError(msg) - - def lower_shape_func(self, source_func, target=None): - key = _get_cache_key(source_func, target) - return _backend._CompileEngineLowerShapeFunc(self, key) - - def jit(self, source_func, target=None): - """JIT a source_func to a tvm.runtime.PackedFunc. - - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - - target : tvm.Target - The target platform. - - Returns - ------- - jited_func: tvm.runtime.PackedFunc - The result of jited function. - """ - key = _get_cache_key(source_func, target) - return _backend._CompileEngineJIT(self, key) - - def clear(self): - """clear the existing cached functions""" - _backend._CompileEngineClear(self) - - def items(self): - """List items in the cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of items. - """ - res = _backend._CompileEngineListItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - def shape_func_items(self): - """List items in the shape_func_cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of shape_func_items. - """ - res = _backend._CompileEngineListShapeFuncItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - def get_current_ccache_key(self): - return _backend._CompileEngineGetCurrentCCacheKey(self) - - def dump(self): - """Return a string representation of engine dump. 
- - Returns - ------- - dump : str - The dumped string representation - """ - items = self.items() - res = "====================================\n" - res += "CompilerEngine dump, %d items cached\n" % len(items) - for k, v in items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - shape_func_items = self.shape_func_items() - res += "%d shape_func_items cached\n" % len(shape_func_items) - for k, v in shape_func_items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - return res - - -def get(): - """Get the global compile engine. - - Returns - ------- - engine : tvm.relay.backend.CompileEngine - The compile engine. 
- """ - return _backend._CompileEngineGlobal() diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index 9e34f0960d05..fd94d2e3ed72 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -19,7 +19,6 @@ from __future__ import absolute_import import logging -import numpy as np import tvm from tvm import te, autotvm from tvm.ir.transform import PassContext @@ -37,6 +36,15 @@ _first_warning = True +@tvm._ffi.register_object("relay.LoweredOutput") +class LoweredOutput(Object): + """Lowered output""" + + def __init__(self, outputs, implement): + self.__init_handle_by_constructor__( + _backend._make_LoweredOutput, outputs, implement) + + @tvm._ffi.register_object("relay.CCacheKey") class CCacheKey(Object): """Key in the CompileEngine. @@ -72,6 +80,61 @@ def _get_cache_key(source_func, target): return source_func +def get_valid_implementations(op, attrs, inputs, out_type, target): + """Get all valid implementations from the op strategy. + + Note that this function doesn't support op with symbolic input shapes. + + Parameters + ---------- + op : tvm.ir.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : List[tvm.te.Tensor] + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.target.Target + The target to compile the op. + + Returns + ------- + ret : List[relay.op.OpImplementation] + The list of all valid op implementations. + """ + fstrategy = op.get_attr("FTVMStrategy") + assert fstrategy is not None, ( + "%s doesn't have an FTVMStrategy registered. You can register " + "one in python with `tvm.relay.op.register_strategy`." 
% op.name + ) + with target: + strategy = fstrategy(attrs, inputs, out_type, target) + analyzer = tvm.arith.Analyzer() + ret = [] + for spec in strategy.specializations: + if spec.condition: + # check if all the clauses in the specialized condition are true + flag = True + for clause in spec.condition.clauses: + clause = analyzer.canonical_simplify(clause) + if isinstance(clause, tvm.tir.IntImm) and clause.value: + continue + flag = False + break + if flag: + for impl in spec.implementations: + ret.append(impl) + else: + for impl in spec.implementations: + ret.append(impl) + return ret + + def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): """Select the best implementation from the op strategy. @@ -235,6 +298,55 @@ def lower(self, source_func, target=None, mod_name="default"): msg += "--------------------------\n" raise RuntimeError(msg) + +@tvm._ffi.register_func("relay.backend.lower_call") +def lower_call(call, inputs, target): + """Lower the call expression to op implementation and tensor outputs.""" + assert isinstance(call.op, tvm.ir.Op) + op = call.op + + # Prepare the call_node->checked_type(). For the call node inputs, we ensure that + # the shape is Int32. Following code ensures the same for the output as well. 
+ # TODO(@icemelon9): Support recursive tuple + ret_type = call.checked_type + if isinstance(ret_type, _ty.TensorType): + ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) + elif isinstance(ret_type, _ty.TupleType): + new_fields = [] + for field in ret_type.fields: + if isinstance(field, _ty.TensorType): + new_fields.append(_ty.TensorType( + get_shape(field.shape), field.dtype)) + else: + new_fields.append(field) + ret_type = _ty.TupleType(new_fields) + + is_dyn = _ty.is_dynamic(call.checked_type) + for arg in call.args: + is_dyn = is_dyn or _ty.is_dynamic(arg.checked_type) + + # check if in the AutoTVM tracing mode, and disable if op is not in wanted list + env = autotvm.task.TaskExtractEnv.current + reenable_tracing = False + if env is not None and env.tracing: + if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: + env.tracing = False + reenable_tracing = True + + if not is_dyn: + best_impl, outputs = select_implementation( + op, call.attrs, inputs, ret_type, target) + else: + # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. 
+        best_impl, outputs = select_implementation(
+            op, call.attrs, inputs, ret_type, target, use_autotvm=False
+        )
+
+    # re-enable AutoTVM tracing
+    if reenable_tracing:
+        env.tracing = True
+    return LoweredOutput(outputs, best_impl)
+
     # def lower_shape_func(self, source_func, target=None):
     #     key = _get_cache_key(source_func, target)
     #     return _backend._CompileEngineLowerShapeFunc(self, key)
@@ -331,6 +443,24 @@ def dump(self):
         return res
 
 
+def get_shape(shape):
+    """Convert the shape to correct dtype and vars."""
+    ret = []
+    for dim in shape:
+        if isinstance(dim, tvm.tir.IntImm):
+            if libinfo()["INDEX_DEFAULT_I64"] == "ON":
+                ret.append(dim)
+            else:
+                val = int(dim)
+                assert val <= np.iinfo(np.int32).max
+                ret.append(tvm.tir.IntImm("int32", val))
+        elif isinstance(dim, tvm.tir.Any):
+            ret.append(te.var("any_dim", "int32"))
+        else:
+            ret.append(dim)
+    return ret
+
+
 def get():
     """Get the global compile engine.
 
@@ -339,11 +469,11 @@ def get():
     engine : tvm.relay.backend.CompileEngine
         The compile engine.
""" - return _backend._CompileEngineGlobal() + return _backend._TECompilerGlobal() -@tvm._ffi.register_object("relay.CompileEngine") -class CompileEngine(Object): +@tvm._ffi.register_object("relay.TECompiler") +class TECompiler(Object): """CompileEngine to get lowered code.""" def __init__(self): @@ -369,7 +499,7 @@ def lower(self, source_func, target=None, mod_name="default"): try: mod_name = mangle_module_name(mod_name) key = _get_cache_key(source_func, target) - return _backend._CompileEngineLower(self, key, mod_name) + return _backend._TECompilerLower(self, key, mod_name) except Exception: import traceback @@ -380,9 +510,9 @@ def lower(self, source_func, target=None, mod_name="default"): msg += "--------------------------\n" raise RuntimeError(msg) - def lower_shape_func(self, source_func, target=None): - key = _get_cache_key(source_func, target) - return _backend._CompileEngineLowerShapeFunc(self, key) + # def lower_shape_func(self, source_func, target=None): + # key = _get_cache_key(source_func, target) + # return _backend._CompileEngineLowerShapeFunc(self, key) def jit(self, source_func, target=None): """JIT a source_func to a tvm.runtime.PackedFunc. 
From b1f817ac51651a7978eda42c3547c45e62f4e2a8 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 13 Oct 2021 20:02:19 +0300 Subject: [PATCH 06/18] [WIP] Exposing legacy compile engine capabilities through TE Compiler --- python/tvm/relay/backend/te_compiler.py | 172 +++++++++++++----------- src/relay/backend/te_compiler.cc | 29 ++++ src/relay/backend/te_compiler.h | 3 +- 3 files changed, 121 insertions(+), 83 deletions(-) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index fd94d2e3ed72..ef37e4c6f6ac 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -345,102 +345,110 @@ def lower_call(call, inputs, target): # re-enable AutoTVM tracing if reenable_tracing: env.tracing = True + return LoweredOutput(outputs, best_impl)(outputs, best_impl) - # def lower_shape_func(self, source_func, target=None): - # key = _get_cache_key(source_func, target) - # return _backend._CompileEngineLowerShapeFunc(self, key) - def jit(self, source_func, target=None): - """JIT a source_func to a tvm.runtime.PackedFunc. +def lower_shape_func(self, source_func, target=None): + key = _get_cache_key(source_func, target) + return _backend._CompileEngineLowerShapeFunc(self, key) - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - target : tvm.Target - The target platform. +def jit(self, source_func, target=None): + """JIT a source_func to a tvm.runtime.PackedFunc. - Returns - ------- - jited_func: tvm.runtime.PackedFunc - The result of jited function. - """ - key = _get_cache_key(source_func, target) - return _backend._CompileEngineJIT(self, key) + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. - def clear(self): - """clear the existing cached functions""" - _backend._TECompilerClear(self) + target : tvm.Target + The target platform. 
- def items(self): - """List items in the cache. + Returns + ------- + jited_func: tvm.runtime.PackedFunc + The result of jited function. + """ + key = _get_cache_key(source_func, target) + return _backend._CompileEngineJIT(self, key) - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of items. - """ - res = _backend._CompileEngineListItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - def shape_func_items(self): - """List items in the shape_func_cache. +def clear(self): + """clear the existing cached functions""" + _backend._TECompilerClear(self) - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of shape_func_items. - """ - res = _backend._CompileEngineListShapeFuncItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - def get_current_ccache_key(self): - return _backend._CompileEngineGetCurrentCCacheKey(self) +def items(self): + """List items in the cache. - def dump(self): - """Return a string representation of engine dump. + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of items. 
+ """ + res = _backend._CompileEngineListItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - Returns - ------- - dump : str - The dumped string representation - """ - items = self.items() - res = "====================================\n" - res += "CompilerEngine dump, %d items cached\n" % len(items) - for k, v in items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - shape_func_items = self.shape_func_items() - res += "%d shape_func_items cached\n" % len(shape_func_items) - for k, v in shape_func_items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - return res + +def shape_func_items(self): + """List items in the shape_func_cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of shape_func_items. 
+ """ + res = _backend._CompileEngineListShapeFuncItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + + +def get_current_ccache_key(self): + return _backend._CompileEngineGetCurrentCCacheKey(self) + + +def dump(self): + """Return a string representation of engine dump. + + Returns + ------- + dump : str + The dumped string representation + """ + items = self.items() + res = "====================================\n" + res += "CompilerEngine dump, %d items cached\n" % len(items) + for k, v in items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + shape_func_items = self.shape_func_items() + res += "%d shape_func_items cached\n" % len(shape_func_items) + for k, v in shape_func_items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + return res def get_shape(shape): diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 445602540dbb..d06ac0e470a2 100644 --- a/src/relay/backend/te_compiler.cc +++ 
b/src/relay/backend/te_compiler.cc @@ -313,6 +313,35 @@ TECompiler::TECompiler() { data_ = object; } +/*! \brief The global TE compiler */ +TECompiler& TECompiler::Global() { + static TECompiler* inst = new TECompiler(make_object()); + return *inst; +} + +TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() { + return TECompiler::Global(); +}); + +TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") + .set_body_typed([](Function source_func, Target target) { + return CCacheKey(source_func, target); + }); + +TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") + .set_body_typed([](tvm::Array outputs, OpImplementation impl) { + return LoweredOutput(outputs, impl); + }); + +TVM_REGISTER_GLOBAL("relay.backend._TECompilerClear").set_body_typed([](TECompiler self) { + self->Clear(); +}); + +TVM_REGISTER_GLOBAL("relay.backend._TECompilerLower") + .set_body_typed([](TECompiler self, CCacheKey key, const String mod_name) { + return self->Lower(key, mod_name); + }); + using AnalysisRemapping = std::unordered_map; std::tuple IsDeviceCopy(const Function& func) { diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 248fd40f98eb..e3b7d46457ad 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -127,6 +127,7 @@ class TECompiler : public ObjectRef { explicit TECompiler(ObjectPtr n) : ObjectRef(n) {} TECompilerNode* operator->() { return static_cast(get_mutable()); } using ContainerType = TECompilerNode; + TVM_DLL static TECompiler& Global(); }; /*! 
@@ -193,7 +194,7 @@ IRModule LowerTE( * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower - * \returns The pass which lowers primative functions to TIR + * \returns The pass which lowers primitive functions to TIR */ transform::Pass LowerTEPass(TargetMap targets, const String& module_name, std::function process_fn); From e4bd0d81e9904548eac263d4d1f6e4f51ac1f739 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:07:54 +0300 Subject: [PATCH 07/18] Swap usages for depreciated compile engine with TE compiler --- python/tvm/relay/backend/te_compiler.py | 158 +----------------------- src/relay/backend/te_compiler.cc | 3 + src/relay/backend/te_compiler_cache.cc | 6 + src/relay/backend/te_compiler_cache.h | 2 +- 4 files changed, 16 insertions(+), 153 deletions(-) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index ef37e4c6f6ac..b7b31944f6e9 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -47,7 +47,7 @@ def __init__(self, outputs, implement): @tvm._ffi.register_object("relay.CCacheKey") class CCacheKey(Object): - """Key in the CompileEngine. + """Key in the TE Compiler. 
Parameters ---------- @@ -65,7 +65,7 @@ def __init__(self, source_func, target): @tvm._ffi.register_object("relay.CCacheValue") class CCacheValue(Object): - """Value in the CompileEngine, including usage statistics.""" + """Value in the TE Compiler, including usage statistics.""" def _get_cache_key(source_func, target): @@ -345,8 +345,7 @@ def lower_call(call, inputs, target): # re-enable AutoTVM tracing if reenable_tracing: env.tracing = True - - return LoweredOutput(outputs, best_impl)(outputs, best_impl) + return LoweredOutput(outputs, best_impl) def lower_shape_func(self, source_func, target=None): @@ -371,7 +370,7 @@ def jit(self, source_func, target=None): The result of jited function. """ key = _get_cache_key(source_func, target) - return _backend._CompileEngineJIT(self, key) + return _backend._TECompilerJIT(self, key) def clear(self): @@ -470,156 +469,11 @@ def get_shape(shape): def get(): - """Get the global compile engine. - - Returns - ------- - engine : tvm.relay.backend.CompileEngine - The compile engine. - """ - return _backend._TECompilerGlobal() - - -@tvm._ffi.register_object("relay.TECompiler") -class TECompiler(Object): - """CompileEngine to get lowered code.""" - - def __init__(self): - raise RuntimeError("Cannot construct a CompileEngine") - - def lower(self, source_func, target=None, mod_name="default"): - """Lower a source_func to a CachedFunc. - - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - - target : tvm.Target - The target platform. - - Returns - ------- - cached_func: CachedFunc - The result of lowering. 
- """ - # pylint: disable=broad-except, import-outside-toplevel - try: - mod_name = mangle_module_name(mod_name) - key = _get_cache_key(source_func, target) - return _backend._TECompilerLower(self, key, mod_name) - except Exception: - import traceback - - msg = traceback.format_exc() - msg += "Error during compile func\n" - msg += "--------------------------\n" - msg += source_func.astext(show_meta_data=False) - msg += "--------------------------\n" - raise RuntimeError(msg) - - # def lower_shape_func(self, source_func, target=None): - # key = _get_cache_key(source_func, target) - # return _backend._CompileEngineLowerShapeFunc(self, key) - - def jit(self, source_func, target=None): - """JIT a source_func to a tvm.runtime.PackedFunc. - - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - - target : tvm.Target - The target platform. - - Returns - ------- - jited_func: tvm.runtime.PackedFunc - The result of jited function. - """ - key = _get_cache_key(source_func, target) - return _backend._CompileEngineJIT(self, key) - - def clear(self): - """clear the existing cached functions""" - _backend._CompileEngineClear(self) - - def items(self): - """List items in the cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of items. - """ - res = _backend._CompileEngineListItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - def shape_func_items(self): - """List items in the shape_func_cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of shape_func_items. - """ - res = _backend._CompileEngineListShapeFuncItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - def get_current_ccache_key(self): - return _backend._CompileEngineGetCurrentCCacheKey(self) - - def dump(self): - """Return a string representation of engine dump. 
- - Returns - ------- - dump : str - The dumped string representation - """ - items = self.items() - res = "====================================\n" - res += "CompilerEngine dump, %d items cached\n" % len(items) - for k, v in items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - shape_func_items = self.shape_func_items() - res += "%d shape_func_items cached\n" % len(shape_func_items) - for k, v in shape_func_items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - return res - - -def get(): - """Get the global compile engine. + """Get the global TE Compiler. Returns ------- - engine : tvm.relay.backend.CompileEngine + engine : tvm.relay.backend.TECompiler The compile engine. 
""" return _backend._TECompilerGlobal() diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index d06ac0e470a2..894b1422cd95 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -342,6 +342,9 @@ TVM_REGISTER_GLOBAL("relay.backend._TECompilerLower") return self->Lower(key, mod_name); }); +TVM_REGISTER_GLOBAL("relay.backend._TECompilerJIT") + .set_body_typed([](TECompiler self, CCacheKey key) { return self->JIT(key); }); + using AnalysisRemapping = std::unordered_map; std::tuple IsDeviceCopy(const Function& func) { diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index ec87cfc98931..824a2ca8f0be 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -58,9 +58,15 @@ LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplementation im auto n = make_object(); n->outputs = std::move(outputs); n->implementation = std::move(impl); + VLOG(1) << outputs; data_ = std::move(n); } +TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") + .set_body_typed([](tvm::Array outputs, OpImplementation impl) { + return LoweredOutput(outputs, impl); + }); + CCacheKey::CCacheKey(Function source_func, Target target) { auto n = make_object(); n->source_func = std::move(source_func); diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h index 47ba96b2c77e..9c9306ac234c 100644 --- a/src/relay/backend/te_compiler_cache.h +++ b/src/relay/backend/te_compiler_cache.h @@ -62,7 +62,7 @@ struct LoweredOutputNode : public Object { v->Visit("outputs", &outputs); v->Visit("implementation", &implementation); } - + print("lowered out struct"); static constexpr const char* _type_key = "relay.LoweredOutput"; TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object); }; From d9b54eb7194aa11a9d1465423ab3c9dc0472dd7f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:49:51 +0300 Subject: [PATCH 
08/18] Track and replace usages of compile engine refactor them to TE compiler --- python/tvm/autotvm/task/relay_integration.py | 4 +- python/tvm/relay/backend/te_compiler.py | 54 +++---------------- python/tvm/relay/testing/py_converter.py | 7 +-- python/tvm/topi/bifrost/conv2d.py | 2 +- python/tvm/topi/x86/conv2d_alter_op.py | 2 +- .../contrib/test_ethosn/infrastructure.py | 4 +- tests/python/relay/aot/aot_test_utils.py | 6 ++- .../relay/dyn/test_dynamic_op_level3.py | 5 +- tests/python/relay/test_json_runtime.py | 8 +-- tests/python/relay/test_op_level3.py | 8 +-- .../python/relay/test_pass_partition_graph.py | 10 ++-- ...le_engine.py => test_relay_te_compiler.py} | 26 ++++----- .../test_tir_transform_narrow_datatype.py | 7 +-- 13 files changed, 55 insertions(+), 88 deletions(-) rename tests/python/relay/{test_backend_compile_engine.py => test_relay_te_compiler.py} (93%) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 714dd540d3ab..4716116a1b83 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -127,12 +127,12 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No assert isinstance( mod, tvm.IRModule ), "only support relay Module or Function to be tuned" - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() # wrap build call in thread to avoid multiprocessing problems build_thread = threading.Thread(target=_lower, args=(mod, target, param)) build_thread.start() build_thread.join() - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() # Clear the warning message cache in FallbackContext if isinstance(DispatchContext.current, FallbackContext): DispatchContext.current.memory = {} diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index b7b31944f6e9..6f5cfc4f19fa 100644 --- 
a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -41,8 +41,7 @@ class LoweredOutput(Object): """Lowered output""" def __init__(self, outputs, implement): - self.__init_handle_by_constructor__( - _backend._make_LoweredOutput, outputs, implement) + self.__init_handle_by_constructor__(_backend._make_LoweredOutput, outputs, implement) @tvm._ffi.register_object("relay.CCacheKey") @@ -59,8 +58,7 @@ class CCacheKey(Object): """ def __init__(self, source_func, target): - self.__init_handle_by_constructor__( - _backend._make_CCacheKey, source_func, target) + self.__init_handle_by_constructor__(_backend._make_CCacheKey, source_func, target) @tvm._ffi.register_object("relay.CCacheValue") @@ -211,8 +209,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) if cfg.is_fallback: # Skip fallback config continue - logger.info("Implementation %s for %s has cost %.2e", - impl.name, op.name, cfg.cost) + logger.info("Implementation %s for %s has cost %.2e", impl.name, op.name, cfg.cost) if best_cfg is None or best_cfg.cost > cfg.cost: best_autotvm_impl = impl best_cfg = cfg @@ -315,8 +312,7 @@ def lower_call(call, inputs, target): new_fields = [] for field in ret_type.fields: if isinstance(field, _ty.TensorType): - new_fields.append(_ty.TensorType( - get_shape(field.shape), field.dtype)) + new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) else: new_fields.append(field) ret_type = _ty.TupleType(new_fields) @@ -334,8 +330,7 @@ def lower_call(call, inputs, target): reenable_tracing = True if not is_dyn: - best_impl, outputs = select_implementation( - op, call.attrs, inputs, ret_type, target) + best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. 
best_impl, outputs = select_implementation( @@ -348,11 +343,6 @@ def lower_call(call, inputs, target): return LoweredOutput(outputs, best_impl) -def lower_shape_func(self, source_func, target=None): - key = _get_cache_key(source_func, target) - return _backend._CompileEngineLowerShapeFunc(self, key) - - def jit(self, source_func, target=None): """JIT a source_func to a tvm.runtime.PackedFunc. @@ -378,36 +368,6 @@ def clear(self): _backend._TECompilerClear(self) -def items(self): - """List items in the cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of items. - """ - res = _backend._CompileEngineListItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - -def shape_func_items(self): - """List items in the shape_func_cache. - - Returns - ------- - item_list : List[Tuple[CCacheKey, CCacheValue]] - The list of shape_func_items. - """ - res = _backend._CompileEngineListShapeFuncItems(self) - assert len(res) % 2 == 0 - return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] - - -def get_current_ccache_key(self): - return _backend._CompileEngineGetCurrentCCacheKey(self) - - def dump(self): """Return a string representation of engine dump. @@ -418,7 +378,7 @@ def dump(self): """ items = self.items() res = "====================================\n" - res += "CompilerEngine dump, %d items cached\n" % len(items) + res += "TE Compiler cached func dump, %d items cached\n" % len(items) for k, v in items: res += "------------------------------------\n" res += "target={}\n".format(k.target) @@ -474,6 +434,6 @@ def get(): Returns ------- engine : tvm.relay.backend.TECompiler - The compile engine. + The TE Compiler. 
""" return _backend._TECompilerGlobal() diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index b9d6806306f4..6da14358c8be 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -24,7 +24,6 @@ import tvm from tvm import relay from tvm.relay.adt import Pattern -from tvm.relay.backend import compile_engine from tvm.relay.expr import Expr, GlobalVar, Var from tvm.relay.function import Function from tvm.relay.expr_functor import ExprFunctor @@ -61,7 +60,6 @@ def __init__(self, mod, target) -> None: super().__init__() self.mod = mod self.tgt = target - self.engine = compile_engine.get() self.fun_no = 0 self.var_no = 0 self.var_map = {} @@ -153,7 +151,10 @@ def parse_name(self, name: str): def parse_numpy_array(self, arr): """Given a Numpy array, produces an appropriate Python array or numerical literal representing its contents.""" - parse_single = lambda i: NameConstant(i) if isinstance(i, bool) else Num(i) + + def parse_single(i): + return NameConstant(i) if isinstance(i, bool) else Num(i) + if arr.ndim == 0: return parse_single(arr.item()) if arr.ndim == 1: diff --git a/python/tvm/topi/bifrost/conv2d.py b/python/tvm/topi/bifrost/conv2d.py index 3b6cca6aaea4..633f36c0e7ff 100644 --- a/python/tvm/topi/bifrost/conv2d.py +++ b/python/tvm/topi/bifrost/conv2d.py @@ -477,7 +477,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + _, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 8e47dff37ce6..3f2df655a615 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ 
b/python/tvm/topi/x86/conv2d_alter_op.py @@ -57,7 +57,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): cfg = dispatch_ctx.query(target, None) workload = cfg.workload else: - impl, outs = relay.backend.compile_engine.select_implementation( + impl, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 92e8f11a2312..c5ebde4b9c61 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -149,7 +149,7 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1): npu_partitions : int, optional The number of Ethos-N partitions expected. """ - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() with tvm.transform.PassContext( opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}} ): @@ -262,7 +262,7 @@ def test_error(mod, params, err_msg): except tvm.error.TVMError as e: caught = e.args[0] finally: - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() assert caught is not None assert err_msg in caught, caught diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 746f595a4422..f2b49ecbded7 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -33,8 +33,10 @@ import tvm from tvm import relay +from tvm import te from tvm.contrib import utils, graph_executor -from tvm.relay.backend import compile_engine +from tvm.relay.backend import te_compiler +from tvm.relay.backend.te_compiler import TECompiler from tvm.relay.backend.utils import mangle_module_name from tvm.micro import export_model_library_format @@ -721,7 +723,7 @@ def compile_and_run( def generate_ref_data(mod, 
input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" - compile_engine.get().clear() + te_compiler.get().clear() with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index 22583eda4a40..7669d02cd536 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -41,7 +41,7 @@ def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets() tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) else: tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() @tvm.testing.uses_gpu @@ -251,7 +251,8 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ verify_sparse_to_dense( [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1] ) # floats - verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # default value not specified + # default value not specified + verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) @pytest.mark.parametrize( diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index ca792204c835..c6eb7531f635 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -26,7 +26,7 @@ from tvm import relay, runtime from tvm.contrib import utils from tvm.relay import transform -from tvm.relay.backend import compile_engine +from tvm.relay.backend import te_compiler from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.register import get_pattern_table @@ -47,7 +47,7 @@ def check_result( return # Run the reference result - compile_engine.get().clear() + te_compiler.get().clear() with tvm.transform.PassContext(opt_level=3): 
json, lib, param = relay.build(ref_mod, target=target, params=params) rt_mod = tvm.contrib.graph_executor.create(json, lib, device) @@ -61,7 +61,7 @@ def check_result( ref_result = out.numpy() def check_vm_result(): - compile_engine.get().clear() + te_compiler.get().clear() with relay.build_config(opt_level=3): exe = relay.vm.compile(mod, target=target, params=params) code, lib = exe.save() @@ -71,7 +71,7 @@ def check_vm_result(): tvm.testing.assert_allclose(out.numpy(), ref_result, rtol=tol, atol=tol) def check_graph_executor_result(): - compile_engine.get().clear() + te_compiler.get().clear() with relay.build_config(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) rt_mod = tvm.contrib.graph_executor.create(json, lib, device) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index eaddd33678df..754c9d1c4a74 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1422,7 +1422,8 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ verify_sparse_to_dense( [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1] ) # floats - verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # default value not specified + # default value not specified + verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # negative test cases # sparse indices should be ints @@ -1757,7 +1758,7 @@ def verify_func(target, dev, func, data, ref_res): tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) else: tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() def test_adv_index(target, dev, executor_kind): @@ -1970,7 +1971,8 @@ def calc_numpy_unique(data, is_sorted=False): uniq = uniq[order].astype(data.dtype) inverse = np.array([reverse_order[i] for i in inverse]).astype("int32") counts = counts[order].astype("int32") - index = 
np.sort(index) # In unsorted case, need to sort the index of first occurence + # In unsorted case, need to sort the index of first occurence + index = np.sort(index) return [ uniq.astype(data.dtype), index.astype("int32"), diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 93cd6f791765..da33aa5d4c4f 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -22,6 +22,7 @@ import numpy as np import tvm +from tvm.relay.backend import te_compiler import tvm.relay.testing import tvm.relay.op as reg from tvm import relay @@ -29,7 +30,6 @@ from tvm.relay import transform from tvm.relay.testing import byoc from tvm.contrib import utils -from tvm.relay.backend import compile_engine from tvm.relay.expr_functor import ExprMutator from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.op.contrib.register import get_pattern_table @@ -143,7 +143,7 @@ def update_lib(lib): return lib def check_vm_result(): - compile_engine.get().clear() + te_compiler.get().clear() with tvm.transform.PassContext(opt_level=3): exe = relay.vm.compile(mod, target=target, params=params) code, lib = exe.save() @@ -157,7 +157,7 @@ def check_vm_result(): tvm.testing.assert_allclose(out.numpy(), ref, rtol=tol, atol=tol) def check_graph_executor_result(): - compile_engine.get().clear() + te_compiler.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) @@ -508,7 +508,7 @@ def test_extern_dnnl_mobilenet(): ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()( i_data, **params ) - compile_engine.get().clear() + te_compiler.get().clear() check_result(mod, {"data": i_data}, (1, 1000), ref_res.numpy(), tol=1e-5, params=params) @@ -950,7 +950,7 @@ def test_exec(mod, params, ref_mod, ref_params, out_shape): ref_res = 
relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()( i_data, **ref_params ) - compile_engine.get().clear() + # compile_engine.get().clear() mod = get_partitoned_mod(mod, params, dnnl_patterns) diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_relay_te_compiler.py similarity index 93% rename from tests/python/relay/test_backend_compile_engine.py rename to tests/python/relay/test_relay_te_compiler.py index 092cae01f568..4248a3b0c5cd 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_relay_te_compiler.py @@ -98,7 +98,7 @@ def _get_impls(dshape, wshape): weight = relay.var("wshape", shape=wshape) out = relay.nn.conv2d(data, weight, padding=(1, 1)) out = run_infer_type(out) - return relay.backend.compile_engine.get_valid_implementations( + return relay.backend.te_compiler.get_valid_implementations( relay.op.get("nn.conv2d"), out.attrs, [te.placeholder(dshape), te.placeholder(wshape)], @@ -121,7 +121,7 @@ def _select_impl(dshape, wshape, use_autotvm=False): weight = relay.var("wshape", shape=wshape) out = relay.nn.conv2d(data, weight, padding=(1, 1)) out = run_infer_type(out) - return relay.backend.compile_engine.select_implementation( + return relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), out.attrs, [te.placeholder(dshape), te.placeholder(wshape)], @@ -161,8 +161,8 @@ def _select_impl(dshape, wshape, use_autotvm=False): assert impl.name == "conv2d_1" -def test_compile_engine(): - engine = relay.backend.compile_engine.get() +def test_te_compiler(): + te_compiler = relay.backend.te_compiler.get() def get_func(shape): x = relay.var("x", shape=shape) @@ -173,31 +173,31 @@ def get_func(shape): mod = relay.transform.InferType()(mod) return mod["main"] - z1 = engine.lower(get_func((10,)), "llvm") - z2 = engine.lower(get_func((10,)), "llvm") - z3 = engine.lower(get_func(()), "llvm") + z1 = te_compiler.lower(get_func((10,)), "llvm") + z2 = 
te_compiler.lower(get_func((10,)), "llvm") + z3 = te_compiler.lower(get_func(()), "llvm") assert z1.same_as(z2) assert not z3.same_as(z1) if tvm.testing.device_enabled("cuda"): - z4 = engine.lower(get_func(()), "cuda") + z4 = te_compiler.lower(get_func(()), "cuda") assert not z3.same_as(z4) # Test JIT target for target in ["llvm"]: dev = tvm.device(target) if tvm.testing.device_enabled(target): - f = engine.jit(get_func((10,)), target) + f = te_compiler.jit(get_func((10,)), target) x = tvm.nd.array(np.ones(10).astype("float32"), device=dev) y = tvm.nd.empty((10,), device=dev) f(x, y) tvm.testing.assert_allclose(y.numpy(), x.numpy() * 3) - engine.dump() + te_compiler.dump() -# Note: Once compile engine is removed, we should keep this test so that +# Note: Once the te compiler is removed, we should keep this test so that # we make sure that opt_level=0 passes are being called correctly. def test_compile_placeholder_bypass(): - engine = relay.backend.compile_engine.get() + te_compiler = relay.backend.te_compiler.get() x = relay.var("x", shape=(2, 3)) y = relay.var("y", shape=(2, 3)) z = relay.var("z", shape=(2, 3)) @@ -264,7 +264,7 @@ def test_compile_nhwc_pack(): if __name__ == "__main__": test_get_valid_implementations() test_select_implementation() - test_compile_engine() + test_te_compiler() test_compile_placeholder_bypass() test_compile_injective_with_tuple() test_compile_tuple_dup() diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py index cb8968cfc880..c76d7e145ecf 100644 --- a/tests/python/unittest/test_tir_transform_narrow_datatype.py +++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py @@ -66,7 +66,8 @@ def check(m, n, target_bits, target_dtype): # const shape # i32 -> i32 check(2, 2, 32, "int32") - check(2 ** 16, 2 ** 16, 32, "int32") # i32 + i32 is not promoted to i64 even if overflow + # i32 + i32 is not promoted to i64 even if overflow + check(2 ** 16, 2 ** 
16, 32, "int32") # i64 -> i32 check(const(2, dtype="int64"), const(2, dtype="int64"), 32, "int32") check(const(2 ** 16, dtype="int64"), const(2 ** 16, dtype="int64"), 32, "int64") @@ -188,7 +189,7 @@ def check(m, n, target_bits, target_dtype): def test_relay_basic(): - engine = relay.backend.compile_engine.get() + engine = relay.backend.te_compiler.get() def check(shapex, shapey, target_bits, target_dtype): x = relay.var("x", shape=shapex) @@ -230,7 +231,7 @@ def check(shapex, shapey, target_bits, target_dtype): def test_relay_take(): - engine = relay.backend.compile_engine.get() + engine = relay.backend.te_compiler.get() def check(shape, index, target_bits, target_dtype): x = relay.var("x", shape=shape) From 13260db0b6cec152ebfd9f5d62f7704d21642b71 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:50:30 +0300 Subject: [PATCH 09/18] [Docs] Log helper mod --- docs/arch/relay_op_strategy.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/arch/relay_op_strategy.rst b/docs/arch/relay_op_strategy.rst index c40251d22433..dbac7c821827 100644 --- a/docs/arch/relay_op_strategy.rst +++ b/docs/arch/relay_op_strategy.rst @@ -269,14 +269,14 @@ will then be chosen. Implementations with same priority level in this case leads to an undefined behavior, and any of them might be selected. The selection policy for ops with symbolic input shapes is still work in -progess. Currently, if any input tensor has a symbolic shape, only the +progress. Currently, if any input tensor has a symbolic shape, only the implementation with highest priority level will be used for this operator. This -will be updated after the implemention finishes. +will be updated after the implementation finishes. For debug purpose, you can add the following lines before you compile the Relay model to learn which implementation is used for each operator. .. 
code:: python - logging.getLogger("compile_engine").setLevel(logging.INFO) - logging.getLogger("compile_engine").addHandler(logging.StreamHandler(sys.stdout)) + logging.getLogger("te_compiler").setLevel(logging.INFO) + logging.getLogger("te_compiler").addHandler(logging.StreamHandler(sys.stdout)) From 820689eeb74803d6b2bccab4c8d81449f3c1da75 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:51:28 +0300 Subject: [PATCH 10/18] Remove deprecated function for compile engine cache lookup --- src/relay/backend/utils.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 6d59b858927c..febb550d45c0 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -427,15 +427,6 @@ inline bool IsAutoSchedulerEnabled() { .value(); } -/*! - * \brief Return whether the compile engine cache is disabled in the pass context. - */ -inline bool IsCompileEngineCacheDisabled() { - return transform::PassContext::Current() - ->GetConfig("relay.backend.disable_compile_engine_cache", Bool(false)) - .value(); -} - /*! * \brief Get the sequence of Relay optimization passes based on backend type. * The prefix of the Relay passes almost overlaps between the vm and graph backend, with some slight From c85da07428de7387f4f3c44789e099706005a3af Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:52:06 +0300 Subject: [PATCH 11/18] Fix typos --- src/runtime/object.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/object.cc b/src/runtime/object.cc index 3cd5df613f4a..4e24434642d8 100644 --- a/src/runtime/object.cc +++ b/src/runtime/object.cc @@ -41,7 +41,7 @@ namespace runtime { struct TypeInfo { /*! \brief The current index. */ uint32_t index{0}; - /*! \brief Index of the parent in the type hierachy */ + /*! 
\brief Index of the parent in the type hierarchy */ uint32_t parent_index{0}; // NOTE: the indices in [index, index + num_reserved_slots) are // reserved for the child-class of this type. @@ -58,7 +58,7 @@ struct TypeInfo { }; /*! - * \brief Type context that manages the type hierachy information. + * \brief Type context that manages the type hierarchy information. */ class TypeContext { public: From b8b9e82712efd0557c1c4ee05e3d3aa57cc24efc Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 14 Oct 2021 13:57:13 +0300 Subject: [PATCH 12/18] Debug misc cleanups --- src/relay/backend/te_compiler_cache.cc | 6 ------ src/relay/backend/te_compiler_cache.h | 1 - 2 files changed, 7 deletions(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 824a2ca8f0be..ec87cfc98931 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -58,15 +58,9 @@ LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplementation im auto n = make_object(); n->outputs = std::move(outputs); n->implementation = std::move(impl); - VLOG(1) << outputs; data_ = std::move(n); } -TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") - .set_body_typed([](tvm::Array outputs, OpImplementation impl) { - return LoweredOutput(outputs, impl); - }); - CCacheKey::CCacheKey(Function source_func, Target target) { auto n = make_object(); n->source_func = std::move(source_func); diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h index 9c9306ac234c..7975ef873173 100644 --- a/src/relay/backend/te_compiler_cache.h +++ b/src/relay/backend/te_compiler_cache.h @@ -62,7 +62,6 @@ struct LoweredOutputNode : public Object { v->Visit("outputs", &outputs); v->Visit("implementation", &implementation); } - print("lowered out struct"); static constexpr const char* _type_key = "relay.LoweredOutput"; TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object); }; From 
62740c53fcb6a37f2b830e83b59dc96eef7583e5 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 15 Oct 2021 12:07:09 +0300 Subject: [PATCH 13/18] Register global pass for using te compiler for auto scheduler --- src/relay/backend/te_compiler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 894b1422cd95..8b8cf8755bcd 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -318,6 +318,7 @@ TECompiler& TECompiler::Global() { static TECompiler* inst = new TECompiler(make_object()); return *inst; } +TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() { return TECompiler::Global(); From cd021b79e2f4e2cb876054d58e96519264b951dc Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 15 Oct 2021 12:28:00 +0300 Subject: [PATCH 14/18] Fix tests using the legacy compile engine --- .../tvm/auto_scheduler/relay_integration.py | 4 +-- python/tvm/relay/testing/py_converter.py | 5 ++-- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- python/tvm/topi/cuda/conv2d_alter_op.py | 2 +- python/tvm/topi/cuda/conv3d_alter_op.py | 2 +- .../topi/intel_graphics/conv2d_alter_op.py | 2 +- python/tvm/topi/mali/conv2d.py | 2 +- python/tvm/topi/x86/dense_alter_op.py | 2 +- .../test_arm_compute_lib/infrastructure.py | 2 +- .../contrib/test_bnns/infrastructure.py | 2 +- .../contrib/test_vitis_ai/infrastructure.py | 25 +++++++++---------- .../python/relay/test_pass_partition_graph.py | 2 +- 12 files changed, 26 insertions(+), 26 deletions(-) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 0eacd1a1f667..6f35e021daf8 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -58,7 +58,6 @@ def call_all_topi_funcs(mod, params, target, opt_level=3): 
opt_level=opt_level, config={ "relay.backend.use_auto_scheduler": True, - "relay.backend.disable_compile_engine_cache": True, }, disabled_pass={"AutoSchedulerLayoutRewrite"}, ): @@ -165,7 +164,8 @@ class TracingMode: """Two modes for tracing""" EXTRACT_TASK = 0 # trace all topi calls to extract tasks - EXTRACT_COMPLEX_TASK_ONLY = 1 # same as EXTRACT_TASK but ignore the task without complex ops + # same as EXTRACT_TASK but ignore the task without complex ops + EXTRACT_COMPLEX_TASK_ONLY = 1 PREPARE_LAYOUT_REWRITE = 2 # trace topi calls to prepare layout rewrite diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index 6da14358c8be..931b032ca1ed 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -24,6 +24,7 @@ import tvm from tvm import relay from tvm.relay.adt import Pattern +from tvm.relay.backend import te_compiler from tvm.relay.expr import Expr, GlobalVar, Var from tvm.relay.function import Function from tvm.relay.expr_functor import ExprFunctor @@ -241,11 +242,11 @@ def create_op_call(self, op: Function, relay_args, py_args): the generated Python code.""" # compile the function and register globally - cc_key = compile_engine.CCacheKey(op, self.tgt) + cc_key = te_compiler.CCacheKey(op, self.tgt) func_hash = tvm.ir.structural_hash(op) op_name = "_lowered_op_{}".format(func_hash) if not tvm.get_global_func(op_name, allow_missing=True): - jitted = self.engine.jit(cc_key, self.tgt) + jitted = self.te_compiler.jit(cc_key, self.tgt) tvm.register_func(op_name, jitted) def convert_input(py_input, arg_type): diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index c7c572c81110..cbe8644c885f 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -90,7 +90,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) 
dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + _, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 4863a06b728d..3d05058ff52c 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -46,7 +46,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - impl, outs = relay.backend.compile_engine.select_implementation( + impl, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/cuda/conv3d_alter_op.py b/python/tvm/topi/cuda/conv3d_alter_op.py index faf73e77255a..c7ec7cb21fcf 100644 --- a/python/tvm/topi/cuda/conv3d_alter_op.py +++ b/python/tvm/topi/cuda/conv3d_alter_op.py @@ -35,7 +35,7 @@ def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + _, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv3d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/intel_graphics/conv2d_alter_op.py b/python/tvm/topi/intel_graphics/conv2d_alter_op.py index 0b59a849c2c9..199d984af1e4 100644 --- a/python/tvm/topi/intel_graphics/conv2d_alter_op.py +++ b/python/tvm/topi/intel_graphics/conv2d_alter_op.py @@ -35,7 +35,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): cfg = dispatch_ctx.query(target, None) workload = cfg.workload else: - _, outs = relay.backend.compile_engine.select_implementation( + _, outs = 
relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index f3ef55b9a30c..051914113a5b 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -531,7 +531,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - impl, outs = relay.backend.compile_engine.select_implementation( + impl, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index 8db84497f82d..1d64261a50d7 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -35,7 +35,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): M, K = get_const_tuple(data_tensor.shape) N, _ = get_const_tuple(weight_tensor.shape) - impl, outs = relay.backend.compile_engine.select_implementation( + impl, outs = relay.backend.te_compiler.select_implementation( relay.op.get("nn.dense"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index f151a85ec5b1..e582874d1de2 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -184,7 +184,7 @@ def build_module(mod, target, params=None, enable_acl=True, tvm_ops=0, acl_parti ), "Got {} Arm Compute Library partitions, expected {}".format( partition_count, acl_partitions ) - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() return relay.build(mod, target=target, params=params) diff --git 
a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py index 46bd049402a9..5a12b0487408 100644 --- a/tests/python/contrib/test_bnns/infrastructure.py +++ b/tests/python/contrib/test_bnns/infrastructure.py @@ -142,7 +142,7 @@ def build_module(mod, target, params=None, enable_bnns=True, tvm_ops=0): with tvm.transform.PassContext(opt_level=3): if enable_bnns: mod = partition_for_bnns(mod) - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() return relay.build(mod, target=target, target_host=target, params=params) diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py index e87d4f874630..d0270deb1671 100644 --- a/tests/python/contrib/test_vitis_ai/infrastructure.py +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -18,24 +18,23 @@ """Expose Vitis-AI test functions to the Python frontend""" +from tvm.contrib import utils +from tvm.contrib import graph_executor +from tvm.contrib.target import vitis_ai +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai +from tvm.relay import transform +from tvm import runtime +from tvm import relay +import tvm +import pyxir.contrib.target.DPUCZDX8G +import pyxir.contrib.target.DPUCADX8G import sys import numpy as np import pytest pytest.importorskip("pyxir") -import pyxir.contrib.target.DPUCADX8G -import pyxir.contrib.target.DPUCZDX8G - -import tvm -from tvm import relay -from tvm import runtime -from tvm.relay import transform -from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai -from tvm.relay.build_module import bind_params_by_name -from tvm.contrib.target import vitis_ai -from tvm.contrib import graph_executor -from tvm.contrib import utils def get_cpu_op_count(mod): @@ -99,7 +98,7 @@ def build_module( ), "Got {} Vitis-AI partitions, expected {}".format( partition_count, vitis_ai_partitions ) - 
relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() return relay.build(mod, target, params=params) diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index da33aa5d4c4f..5aba6229c5e2 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -950,7 +950,7 @@ def test_exec(mod, params, ref_mod, ref_params, out_shape): ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)).evaluate()( i_data, **ref_params ) - # compile_engine.get().clear() + te_compiler.get().clear() mod = get_partitoned_mod(mod, params, dnnl_patterns) From 1df81328b1a9302bc18300169ebe8c7e112e48ba Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Mon, 18 Oct 2021 11:31:44 +0300 Subject: [PATCH 15/18] Fix broken autotuner tests and minor cleanups --- .../graph_tuner/utils/traverse_graph.py | 2 +- python/tvm/relay/backend/te_compiler.py | 189 ++++++++---------- src/relay/backend/te_compiler.cc | 6 + tests/python/relay/test_relay_te_compiler.py | 2 +- 4 files changed, 86 insertions(+), 113 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 723e7fa77006..7299875bf28d 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -142,7 +142,7 @@ def _traverse_expr(node): params.append(free_var) call = relay.Call(node.op, params, node.attrs) mod = tvm.IRModule.from_expr(relay.Function(params, call)) - relay.backend.compile_engine.get().clear() + relay.backend.te_compiler.get().clear() tracing_target = _replace_device_with_tracing(tvm_target) build_thread = threading.Thread( target=relay.build, args=(mod, tracing_target, None, None) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index 6f5cfc4f19fa..d3d32125d6f9 100644 --- 
a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -256,44 +256,22 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) return best_plevel_impl, outputs[best_plevel_impl] -@tvm._ffi.register_object("relay.TECompiler") -class TECompiler(Object): - """TECompiler to get lowered code.""" - - def __init__(self): - raise RuntimeError("Cannot construct a TECompiler") - - def lower(self, source_func, target=None, mod_name="default"): - """Lower a source_func to a CachedFunc. - - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. - - target : tvm.Target - The target platform. - - Returns - ------- - cached_func: CachedFunc - The result of lowering. - """ - # pylint: disable=broad-except, import-outside-toplevel - try: - mod_name = mangle_module_name(mod_name) - key = _get_cache_key(source_func, target) - print(key) - return _backend._TECompilerLower(self, key, mod_name) - except Exception: - import traceback - - msg = traceback.format_exc() - msg += "Error during compile func\n" - msg += "--------------------------\n" - msg += source_func.astext(show_meta_data=False) - msg += "--------------------------\n" - raise RuntimeError(msg) +def get_shape(shape): + """Convert the shape to correct dtype and vars.""" + ret = [] + for dim in shape: + if isinstance(dim, tvm.tir.IntImm): + if libinfo()["INDEX_DEFAULT_I64"] == "ON": + ret.append(dim) + else: + val = int(dim) + assert val <= np.iinfo(np.int32).max + ret.append(tvm.tir.IntImm("int32", val)) + elif isinstance(dim, tvm.tir.Any): + ret.append(te.var("any_dim", "int32")) + else: + ret.append(dim) + return ret @tvm._ffi.register_func("relay.backend.lower_call") @@ -343,89 +321,78 @@ def lower_call(call, inputs, target): return LoweredOutput(outputs, best_impl) -def jit(self, source_func, target=None): - """JIT a source_func to a tvm.runtime.PackedFunc. 
+@tvm._ffi.register_object("relay.TECompiler") +class TECompiler(Object): + """TECompiler to get lowered code.""" - Parameters - ---------- - source_func : Union[tvm.relay.Function, CCacheKey] - The source relay function. + def __init__(self): + raise RuntimeError("Cannot construct a TECompiler") - target : tvm.Target - The target platform. + def lower(self, source_func, target=None, mod_name="default"): + """Lower a source_func to a CachedFunc. - Returns - ------- - jited_func: tvm.runtime.PackedFunc - The result of jited function. - """ - key = _get_cache_key(source_func, target) - return _backend._TECompilerJIT(self, key) + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + target : tvm.Target + The target platform. -def clear(self): - """clear the existing cached functions""" - _backend._TECompilerClear(self) + Returns + ------- + cached_func: CachedFunc + The result of lowering. + """ + # pylint: disable=broad-except, import-outside-toplevel + try: + mod_name = mangle_module_name(mod_name) + key = _get_cache_key(source_func, target) + return _backend._TECompilerLower(self, key, mod_name) + except Exception: + import traceback + msg = traceback.format_exc() + msg += "Error during compile func\n" + msg += "--------------------------\n" + msg += source_func.astext(show_meta_data=False) + msg += "--------------------------\n" + raise RuntimeError(msg) -def dump(self): - """Return a string representation of engine dump. + def jit(self, source_func, target=None): + """JIT a source_func to a tvm.runtime.PackedFunc. 
- Returns - ------- - dump : str - The dumped string representation - """ - items = self.items() - res = "====================================\n" - res += "TE Compiler cached func dump, %d items cached\n" % len(items) - for k, v in items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - shape_func_items = self.shape_func_items() - res += "%d shape_func_items cached\n" % len(shape_func_items) - for k, v in shape_func_items: - res += "------------------------------------\n" - res += "target={}\n".format(k.target) - res += "use_count={}\n".format(v.use_count) - res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint) - res += "----relay function----\n" - res += k.source_func.astext() + "\n" - res += "----tir function----- \n" - res += "inputs={}\n".format(v.cached_func.inputs) - res += "outputs={}\n".format(v.cached_func.outputs) - res += "function: \n" - res += v.cached_func.funcs.astext() + "\n" - res += "===================================\n" - return res + Parameters + ---------- + source_func : Union[tvm.relay.Function, CCacheKey] + The source relay function. + target : tvm.Target + The target platform. 
-def get_shape(shape): - """Convert the shape to correct dtype and vars.""" - ret = [] - for dim in shape: - if isinstance(dim, tvm.tir.IntImm): - if libinfo()["INDEX_DEFAULT_I64"] == "ON": - ret.append(dim) - else: - val = int(dim) - assert val <= np.iinfo(np.int32).max - ret.append(tvm.tir.IntImm("int32", val)) - elif isinstance(dim, tvm.tir.Any): - ret.append(te.var("any_dim", "int32")) - else: - ret.append(dim) - return ret + Returns + ------- + jited_func: tvm.runtime.PackedFunc + The result of jited function. + """ + print("caling jit \n") + key = _get_cache_key(source_func, target) + return _backend._TECompilerJIT(self, key) + + def clear(self): + """clear the existing cached functions""" + _backend._TECompilerClear(self) + + def items(self): + """List items in the cache. + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of items. + """ + res = _backend._TECompilerListItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] def get(): diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 8b8cf8755bcd..a8c27a126032 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -346,6 +346,12 @@ TVM_REGISTER_GLOBAL("relay.backend._TECompilerLower") TVM_REGISTER_GLOBAL("relay.backend._TECompilerJIT") .set_body_typed([](TECompiler self, CCacheKey key) { return self->JIT(key); }); +TVM_REGISTER_GLOBAL("relay.backend._TECompilerListItems").set_body_typed([](TECompiler self) { + TECompilerImpl* ptr = dynamic_cast(self.operator->()); + ICHECK(ptr != nullptr); + return ptr->ListItems(); +}); + using AnalysisRemapping = std::unordered_map; std::tuple IsDeviceCopy(const Function& func) { diff --git a/tests/python/relay/test_relay_te_compiler.py b/tests/python/relay/test_relay_te_compiler.py index 4248a3b0c5cd..e3e3d0f523cc 100644 --- a/tests/python/relay/test_relay_te_compiler.py +++ b/tests/python/relay/test_relay_te_compiler.py 
@@ -163,6 +163,7 @@ def _select_impl(dshape, wshape, use_autotvm=False): def test_te_compiler(): te_compiler = relay.backend.te_compiler.get() + # print(te_compiler) def get_func(shape): x = relay.var("x", shape=shape) @@ -191,7 +192,6 @@ def get_func(shape): y = tvm.nd.empty((10,), device=dev) f(x, y) tvm.testing.assert_allclose(y.numpy(), x.numpy() * 3) - te_compiler.dump() # Note: Once the te compiler is removed, we should keep this test so that From 510026318ec3ce2728b9709ee8c407baad1967f2 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Mon, 18 Oct 2021 13:01:13 +0300 Subject: [PATCH 16/18] Swap compile engine with te_compiler in rst config --- docs/reference/api/python/relay/backend.rst | 2 +- python/tvm/relay/testing/py_converter.py | 5 +++-- tests/python/relay/test_relay_te_compiler.py | 14 +++++++------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/reference/api/python/relay/backend.rst b/docs/reference/api/python/relay/backend.rst index ffe8a9a8ce79..e717ee10ffab 100644 --- a/docs/reference/api/python/relay/backend.rst +++ b/docs/reference/api/python/relay/backend.rst @@ -23,7 +23,7 @@ tvm.relay.backend .. automodule:: tvm.relay.backend.interpreter :members: -.. automodule:: tvm.relay.backend.compile_engine +.. automodule:: tvm.relay.backend.te_compiler :members: .. 
automodule:: tvm.relay.backend.graph_executor_codegen diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index 931b032ca1ed..16d87c821753 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -61,6 +61,7 @@ def __init__(self, mod, target) -> None: super().__init__() self.mod = mod self.tgt = target + self.tec = te_compiler.get() self.fun_no = 0 self.var_no = 0 self.var_map = {} @@ -242,11 +243,11 @@ def create_op_call(self, op: Function, relay_args, py_args): the generated Python code.""" # compile the function and register globally - cc_key = te_compiler.CCacheKey(op, self.tgt) + cc_key = self.tec.CCacheKey(op, self.tgt) func_hash = tvm.ir.structural_hash(op) op_name = "_lowered_op_{}".format(func_hash) if not tvm.get_global_func(op_name, allow_missing=True): - jitted = self.te_compiler.jit(cc_key, self.tgt) + jitted = self.tec.jit(cc_key, self.tgt) tvm.register_func(op_name, jitted) def convert_input(py_input, arg_type): diff --git a/tests/python/relay/test_relay_te_compiler.py b/tests/python/relay/test_relay_te_compiler.py index e3e3d0f523cc..f8498ae83648 100644 --- a/tests/python/relay/test_relay_te_compiler.py +++ b/tests/python/relay/test_relay_te_compiler.py @@ -21,6 +21,7 @@ from tvm import relay from tvm import autotvm from tvm import topi +from tvm.relay.backend import te_compiler from tvm.relay.testing import run_infer_type from tvm.relay.testing.temp_op_attr import TempOpAttr import tvm.testing @@ -162,8 +163,7 @@ def _select_impl(dshape, wshape, use_autotvm=False): def test_te_compiler(): - te_compiler = relay.backend.te_compiler.get() - # print(te_compiler) + tec = relay.backend.te_compiler.get() def get_func(shape): x = relay.var("x", shape=shape) @@ -174,20 +174,20 @@ def get_func(shape): mod = relay.transform.InferType()(mod) return mod["main"] - z1 = te_compiler.lower(get_func((10,)), "llvm") - z2 = te_compiler.lower(get_func((10,)), "llvm") - z3 = 
te_compiler.lower(get_func(()), "llvm") + z1 = tec.lower(get_func((10,)), "llvm") + z2 = tec.lower(get_func((10,)), "llvm") + z3 = tec.lower(get_func(()), "llvm") assert z1.same_as(z2) assert not z3.same_as(z1) if tvm.testing.device_enabled("cuda"): - z4 = te_compiler.lower(get_func(()), "cuda") + z4 = tec.lower(get_func(()), "cuda") assert not z3.same_as(z4) # Test JIT target for target in ["llvm"]: dev = tvm.device(target) if tvm.testing.device_enabled(target): - f = te_compiler.jit(get_func((10,)), target) + f = tec.jit(get_func((10,)), target) x = tvm.nd.array(np.ones(10).astype("float32"), device=dev) y = tvm.nd.empty((10,), device=dev) f(x, y) From 1a537132f2a14ad60a2230285d3d2286cf56a2f6 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Mon, 18 Oct 2021 15:54:05 +0300 Subject: [PATCH 17/18] PR nits --- python/tvm/relay/backend/te_compiler.py | 1 - .../contrib/test_vitis_ai/infrastructure.py | 23 ++++++++++--------- tests/python/relay/aot/aot_test_utils.py | 1 - 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index d3d32125d6f9..db7504915887 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -375,7 +375,6 @@ def jit(self, source_func, target=None): jited_func: tvm.runtime.PackedFunc The result of jited function. 
""" - print("caling jit \n") key = _get_cache_key(source_func, target) return _backend._TECompilerJIT(self, key) diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py index d0270deb1671..578ac37da25b 100644 --- a/tests/python/contrib/test_vitis_ai/infrastructure.py +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -18,23 +18,24 @@ """Expose Vitis-AI test functions to the Python frontend""" -from tvm.contrib import utils -from tvm.contrib import graph_executor -from tvm.contrib.target import vitis_ai -from tvm.relay.build_module import bind_params_by_name -from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai -from tvm.relay import transform -from tvm import runtime -from tvm import relay -import tvm -import pyxir.contrib.target.DPUCZDX8G -import pyxir.contrib.target.DPUCADX8G import sys import numpy as np import pytest pytest.importorskip("pyxir") +import pyxir.contrib.target.DPUCADX8G +import pyxir.contrib.target.DPUCZDX8G + +import tvm +from tvm import relay +from tvm import runtime +from tvm.relay import transform +from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai +from tvm.relay.build_module import bind_params_by_name +from tvm.contrib.target import vitis_ai +from tvm.contrib import graph_executor +from tvm.contrib import utils def get_cpu_op_count(mod): diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index f2b49ecbded7..276cad375357 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -723,7 +723,6 @@ def compile_and_run( def generate_ref_data(mod, input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" - te_compiler.get().clear() with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) From 3f0842451c0ebf15382111a86cd9d9dd7a2a73b9 Mon Sep 17 00:00:00 2001 From: 
Michalis Papadimitriou Date: Mon, 18 Oct 2021 19:34:32 +0300 Subject: [PATCH 18/18] Fix failed test --- python/tvm/relay/testing/py_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index 16d87c821753..50f473aea1f2 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -243,7 +243,7 @@ def create_op_call(self, op: Function, relay_args, py_args): the generated Python code.""" # compile the function and register globally - cc_key = self.tec.CCacheKey(op, self.tgt) + cc_key = te_compiler.CCacheKey(op, self.tgt) func_hash = tvm.ir.structural_hash(op) op_name = "_lowered_op_{}".format(func_hash) if not tvm.get_global_func(op_name, allow_missing=True):